Remove GTest dependency, add code coverage, and refactor unit tests and CI pipelines (#744)

- Removes the GTest dependency, replacing it with a minimal custom
framework (`test/framework.*`) that covers only what the tests actually
use — a unified `TEST()` macro with SFINAE-based fixture auto-detection,
`EXPECT_*`/`ASSERT_*` assertions, environments, and setup/teardown.
- Adds an `--exclude-perf-tests` flag and substring-based negative filtering
- Adds an `MSCCLPP_ENABLE_COVERAGE` CMake option with gcov/lcov; CI uploads to
Codecov
- Merges standalone `test/perf/` into main test targets
- Refactors Azure pipelines to reduce redundancy and improve readability

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
This commit is contained in:
Copilot
2026-03-24 23:34:38 -04:00
committed by GitHub
parent 5d18835417
commit 93f6eeaa6b
68 changed files with 2116 additions and 2416 deletions

View File

@@ -0,0 +1,93 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: CodeCoverageA100
timeoutInMinutes: 40
pool:
name: msccl-ci
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: CodeCoverageH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
- job: CodeCoverageMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
variables:
- group: mscclpp
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

View File

@@ -41,11 +41,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -37,33 +37,6 @@ jobs:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -77,107 +50,46 @@ jobs:
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
- template: templates/deploy.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- task: Bash@3
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- task: Bash@3
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- task: Bash@3
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/stop.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT

View File

@@ -40,11 +40,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -61,9 +60,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -40,9 +40,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/rccl-test.yaml
- template: templates/rccl-test.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
gpuArch: gfx942

View File

@@ -0,0 +1,110 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: TestsCoverageNonPerf
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp
export GCOV_PREFIX_STRIP=$STRIP_COUNT
echo "Running unit_tests..."
./build/bin/unit_tests
echo "unit_tests: PASSED"
echo "Running mp_unit_tests -np 2..."
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 2: PASSED"
echo "Running mp_unit_tests -np 4..."
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 4: PASSED"
- template: run-remote-task.yml
parameters:
name: CaptureCoverage
displayName: Capture coverage data with lcov
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
GCOV_TOOL_ARG=""
if [ "${{ parameters.platform }}" = "rocm" ]; then
apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
GCOV_WRAPPER=$(mktemp)
printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
chmod +x "$GCOV_WRAPPER"
GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
fi
lcov --version
LCOV_CAPTURE_ARGS=""
if lcov --help 2>&1 | grep -q "inconsistent"; then
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
fi
lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
if [ ! -s coverage.info ]; then
echo "ERROR: coverage.info was not generated."
exit 1
fi
lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
lcov --list coverage.info
ls -la coverage.info
- task: Bash@3
name: FetchCoverage
displayName: Fetch coverage data from remote VM
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
HOST=$(head -1 ${HOSTFILE})
ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UploadCodecov
displayName: Upload coverage to Codecov
inputs:
targetType: 'inline'
script: |
set -e
curl -Os https://cli.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,131 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
# Build parameters
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
default: ''
- name: buildType
type: string
default: 'Release'
- name: buildTests
type: string
default: 'true'
- name: cmakeArgs
type: string
default: ''
- name: buildName
type: string
default: 'Build'
- name: buildDisplayName
type: string
default: 'Build'
# Deploy parameters
- name: deployArgs
type: string
default: ''
steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
name: EnsureAzureCLI
displayName: Ensure Azure CLI Installed
inputs:
targetType: inline
script: |
set -e
if command -v az >/dev/null 2>&1; then
az version >/dev/null
exit 0
fi
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# 1. Build
- task: Bash@3
name: ${{ parameters.buildName }}
displayName: ${{ parameters.buildDisplayName }}
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
BUILD_TESTS_ARG=""
if [ "${{ parameters.buildTests }}" = "true" ]; then
BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
fi
GPU_ARCH_ARG=""
if [ -n "${{ parameters.gpuArch }}" ]; then
GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ "${{ parameters.platform }}" = "rocm" ]; then
eval CXX=/opt/rocm/bin/hipcc cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_ROCM=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
else
eval cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
echo "=== Build artifacts ==="
ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
du -sh build/bin/* 2>/dev/null || true
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 2. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
# 3. Deploy test environment
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: ${{ parameters.deployArgs }}
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -1,242 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: SendRecvTest
displayName: Run mscclpp SendRecv test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp AllReduce test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllToAll
displayName: Run mscclpp AllToAll test
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,282 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# sshKeySecureFile the secureFile name for your SSH key
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: nccltest-single-node
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNcclTests
displayName: Install NCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNccl
displayName: Install NCCL
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\\\" -f4); \
if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \
cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \
cd nccl; \
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL AllGather perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "allgather" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allreduce" (allgather stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclAllGatherFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL AllReduce perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "allreduce" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allgather" (allreduce stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclAllReduceFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL Broadcast perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "broadcast" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allreduce" (broadcast stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclBroadcastFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbaclkToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS when the job ends, pass or fail (condition:
# always()), so the GPU machines do not keep accruing cost after the run.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       VMSS name to start/stop
#   nvccGencode    NVCC -gencode flags used when building NCCL
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: nvccGencode    # NVCC -gencode flags used when building NCCL
  type: string
  default: "-gencode=arch=compute_80,code=sm_80"
steps:
# Build mscclpp, start the VMSS, and deploy the test environment.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    deployArgs: 'nccltest-single-node'
- template: run-remote-task.yml
  parameters:
    name: InstallNcclTests
    displayName: Install NCCL Tests
    remoteScript: |
      cd
      git clone https://github.com/NVIDIA/nccl-tests.git
      cd nccl-tests
      MPI=1 MPI_HOME=/usr/local/mpi make -j
# Build the latest tagged upstream NCCL release as the fallback library.
- template: run-remote-task.yml
  parameters:
    name: InstallNccl
    displayName: Install NCCL
    remoteScript: |
      LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
      if [ -z "$LATEST_TAG" ]; then
        echo "Failed to fetch latest NCCL tag"
        exit 1
      fi
      cd
      git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
      cd nccl
      make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
# Each fallback test runs twice: once forcing fallback for the tested
# operation and once forcing it for a different one (no fallback path hit).
- template: run-remote-task.yml
  parameters:
    name: RunNcclAllGatherFallbaclkToNcclTest
    displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunNcclAllReduceFallbaclkToNcclTest
    displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunNcclBroadcastFallbaclkToNcclTest
    displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,142 +0,0 @@
# .azure-pipelines/templates/rccl-test.yaml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# sshKeySecureFile the secureFile name for your SSH key
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
default: "gfx942"
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true rocm"
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: InstallRcclTests
displayName: Install RCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; \
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
cd rocm-systems; \
git sparse-checkout init --cone; \
git sparse-checkout set projects/rccl-tests; \
git checkout; \
cd projects/rccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllGatherTest
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllReduceTest
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,63 @@
# .azure-pipelines/templates/rccl-test.yml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: gpuArch        # ROCm GPU architecture passed to the build (e.g. gfx942)
  type: string
  default: "gfx942"
steps:
# Build for ROCm (tests disabled) and deploy onto the single-node VMSS.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: rocm
    gpuArch: ${{ parameters.gpuArch }}
    buildTests: false
    deployArgs: 'single-node-test true rocm'
# Sparse-checkout only the rccl-tests project from the rocm-systems monorepo.
- template: run-remote-task.yml
  parameters:
    name: InstallRcclTests
    displayName: Install RCCL Tests
    remoteScript: |
      cd
      git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
      cd rocm-systems
      git sparse-checkout init --cone
      git sparse-checkout set projects/rccl-tests
      git checkout
      cd projects/rccl-tests
      MPI=1 MPI_HOME=/usr/local/mpi make -j
# Each test runs twice: once preloading the MSCCLPP NCCL shim, once with stock RCCL.
- template: run-remote-task.yml
  parameters:
    name: RunRcclAllGatherTest
    displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunRcclAllReduceTest
    displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,27 @@
# .azure-pipelines/templates/run-remote-task.yml
# ----------------------------------------------
# Wraps test/deploy/run-remote.sh: executes `remoteScript` on the deployed
# test hosts. The body is fed through a quoted heredoc ('REMOTE_CMD') so it
# is not expanded by the local shell before being shipped to the remote side.
parameters:
- name: name             # optional Azure step name; omitted when empty
  type: string
  default: ''
- name: displayName
  type: string
- name: runRemoteArgs    # extra flags for run-remote.sh (e.g. '--no-docker --no-log')
  type: string
  default: ''
- name: remoteScript     # script body to execute remotely
  type: string
- name: workingDirectory
  type: string
  default: '$(System.DefaultWorkingDirectory)'
steps:
- task: Bash@3
  # Only attach a step `name` when the caller supplied one; an empty name
  # is not a valid step identifier.
  ${{ if ne(parameters.name, '') }}:
    name: ${{ parameters.name }}
  displayName: ${{ parameters.displayName }}
  inputs:
    targetType: 'inline'
    script: |
      test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
      ${{ parameters.remoteScript }}
      REMOTE_CMD
    workingDirectory: ${{ parameters.workingDirectory }}

View File

@@ -0,0 +1,20 @@
# .azure-pipelines/templates/stop.yml
# -----------------------------------
# Deallocates the CI VMSS. Runs unconditionally (condition: always()) so
# the GPU machines are released even when earlier steps in the job failed.
parameters:
- name: subscription   # Azure subscription used for the CLI call
  type: string
- name: vmssName       # VMSS to deallocate
  type: string
- name: resourceGroup
  type: string
  default: mscclpp
steps:
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

View File

@@ -1,191 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: StopContainer
displayName: Stop existing container
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: BuildWithIb
displayName: Rebuild with IB
inputs:
targetType: 'inline'
script: |
rm -rf build && mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: DeployTestEnvWithIb
displayName: Deploy Test Env (with IB build)
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: PyTestsWithIbBuildDisableIb
displayName: Run pytests (IB build, IB tests disabled)
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,95 @@
# Runs the mscclpp unit tests, multi-process unit tests, and python tests on
# one pool/container: first from a build with IB disabled, then rebuilds with
# IB enabled and re-runs the python tests (IB tests themselves still disabled).
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: gpuArch        # GPU architecture for the CMake build
  type: string
steps:
# Build without IB support and deploy onto the single-node VMSS.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
    deployArgs: 'single-node-test false'
- template: run-remote-task.yml
  parameters:
    name: UnitTests
    displayName: Run mscclpp unit tests
    remoteScript: |
      ./build/bin/unit_tests
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Tear down the container before redeploying with the IB build.
- template: run-remote-task.yml
  parameters:
    name: StopContainer
    displayName: Stop existing container
    runRemoteArgs: '--no-docker --no-log'
    remoteScript: |
      sudo docker stop mscclpp-test || true
      sudo docker rm mscclpp-test || true
- task: Bash@3
  displayName: Remove generated SSH key files
  inputs:
    targetType: 'inline'
    script: |
      rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Second build: same configuration but with IB support left on (default).
- task: Bash@3
  name: BuildWithIb
  displayName: Rebuild with IB
  inputs:
    targetType: 'inline'
    script: |
      set -e
      rm -rf build
      mkdir -p build && cd build
      cmake \
        -DCMAKE_BUILD_TYPE=Release \
        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
        -DMSCCLPP_USE_CUDA=ON \
        -DMSCCLPP_BUILD_TESTS=ON \
        -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: DeployTestEnvWithIb
  displayName: Deploy Test Env (with IB build)
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: single-node-test false
    workingDirectory: $(System.DefaultWorkingDirectory)
- template: run-remote-task.yml
  parameters:
    name: PyTestsWithIbBuildDisableIb
    displayName: Run pytests (IB build, IB tests disabled)
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,145 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# Start the VM scale set that hosts the GPU test nodes.
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
# Provision the test environment (containers, hostfile) on the started nodes.
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test"
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Build MSCCL++ (with NPKit event flags) inside the mscclpp-test container on
# each host listed in the CI hostfile, via parallel-ssh.
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      # parallel-ssh -o . writes per-host output to a file named after the
      # host ("azureuser@10.0.0.4"); truncate it and stream it live via tail.
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        set -e; \
        cd /root/mscclpp; \
        mkdir -p build && cd build; \
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
        make -j"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Run the multi-process unit tests under NPKit, then verify the generated
# trace contains the expected executor events.
- task: Bash@3
  name: MpUnitTests
  displayName: Run mscclpp multi-process unit tests
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Run executor pytests under NPKit (non-packet then packet variants) and
# verify the corresponding NPKit events appear in the trace.
- task: Bash@3
  name: PyTests
  displayName: Run pytests
  inputs:
    targetType: 'inline'
    script: |
      # set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      # NOTE(review): the single quotes around the -k filters below close and
      # reopen the outer single-quoted remote command, so the filter reaches
      # pytest unquoted on the remote shell — confirm this is intended.
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS, even if earlier steps failed, to avoid cost.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,57 @@
# Azure Pipelines step template: build MSCCL++ with NPKit event flags, run the
# NPKit-instrumented unit/py tests on a VMSS node, and verify the generated
# event trace. Composes the shared deploy / run-remote-task / stop templates.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: gpuArch
  type: string

steps:
# Start the VMSS, deploy the test environment, and build with NPKit enabled.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
    deployArgs: 'single-node-test'
# Run the executor multi-process unit test under NPKit and check that the
# expected executor events appear in the generated trace.
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
# Run the executor pytests (non-packet then packet variants) under NPKit and
# check for the corresponding events in each trace.
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
# Deallocate the VMSS (runs even on failure inside stop.yml).
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,142 +0,0 @@
# Azure Pipelines step template: build MSCCL++ on the agent, then run the
# unit tests, multi-process unit tests, and pytests inside the mscclpp-test
# container on a VMSS node reached via parallel-ssh.
#
# Fix: the PyTests remote command previously ran
#   export PATH=/usr/local/mpi/bin:$PATH \
#   export LD_LIBRARY_PATH=...; 
# i.e. the second `export` became an argument of the first (missing `;`
# before the line continuation). A `;` terminator is now added so each
# export is its own command, matching the MpUnitTests step.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: sshKeySecureFile
  type: string
# GPU platform: 'cuda' (default) or 'rocm'; selects the compiler/CMake flags.
- name: platform
  type: string
  default: 'cuda'
- name: gpuArch
  type: string

steps:
# Build on the agent (GPU check bypassed; the build host has no GPU).
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      mkdir build && cd build
      if [ "${{ parameters.platform }}" == "rocm" ]; then
        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      else
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      fi
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Fetch the SSH private key used by parallel-ssh to reach the test node.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test true ${{ parameters.platform }}"
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Each remote step below truncates the per-host parallel-ssh output file
# ("azureuser@10.0.0.4", created by -o .) and streams it live via tail.
- task: Bash@3
  name: UnitTests
  displayName: Run mscclpp unit tests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        ./build/bin/unit_tests"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: MpUnitTests
  displayName: Run mscclpp multi-process unit tests
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        cd /root/mscclpp; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: PyTests
  displayName: Run pytests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS, even on failure, to avoid cost.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,48 @@
# Azure Pipelines step template: deploy a test node, run the unit tests,
# multi-process unit tests, and pytests, then stop the node. Composes the
# shared deploy / run-remote-task / stop templates.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
# GPU platform: 'cuda' (default) or 'rocm'; forwarded to deploy.yml.
- name: platform
  type: string
  default: 'cuda'
- name: gpuArch
  type: string

steps:
# Start the VMSS, deploy the test environment, and build MSCCL++.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: ${{ parameters.platform }}
    gpuArch: ${{ parameters.gpuArch }}
    deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
  parameters:
    name: UnitTests
    displayName: Run mscclpp unit tests
    remoteScript: |
      ./build/bin/unit_tests
# Run the MPI-based tests at 2, 4, and 8 ranks.
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Deallocate the VMSS (runs even on failure inside stop.yml).
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,50 +0,0 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
platform: rocm
gpuArch: gfx942

View File

@@ -37,17 +37,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -59,17 +58,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -79,17 +77,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -99,17 +96,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
@@ -121,15 +117,34 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-no-ib-env.yaml
- template: templates/ut-no-ib-env.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

24
.codecov.yml Normal file
View File

@@ -0,0 +1,24 @@
# Codecov configuration for MSCCL++.
codecov:
  # Only post coverage status when CI passed.
  require_ci_to_pass: yes
coverage:
  status:
    # Overall project coverage target, with 1% allowed drop per commit.
    project:
      default:
        target: 68%
        threshold: 1%
    # Coverage target for lines changed in a PR.
    patch:
      default:
        target: 80%
flag_management:
  default_rules:
    # Carry forward flag coverage from the last relevant commit when a flag
    # is not uploaded for the current one.
    carryforward: true
# Paths excluded from coverage reporting.
ignore:
  - "test/"
  - "examples/"
  - "python/"
  - "tools/"
  - "docs/"
  - "docker/"

View File

@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- name: Checkout repository
@@ -62,7 +62,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
@@ -107,7 +107,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis

View File

@@ -1,69 +0,0 @@
# GitHub Actions workflow: manually-dispatched integration tests for the
# MSCCL++ collective benchmarks on a self-hosted 8xA100 runner, with results
# checked against a stored baseline.
#
# Fix: the `-k 5` allreduce invocation had a stray token `2` after
# `-i 3145728` (`-i 3145728 2 -k 5`), inconsistent with every other
# invocation in this workflow; the stray token is removed.
name: IntegrationTest

on: workflow_dispatch

jobs:
  IntegrationTest:
    runs-on: [ self-hosted, A100 ]
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        cuda: [ cuda11.8, cuda12.2 ]
    container:
      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build
        run: |
          mkdir build && cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j

      # Pin each GPU to its max memory/SM clocks for stable perf numbers.
      - name: Lock GPU clock frequency
        run: |
          sudo nvidia-smi -pm 1
          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
          done

      # -b/-e: min/max message size; -f: size multiplier; -i: size increment;
      # -k: kernel variant; -o: JSONL output appended for the baseline check.
      - name: Run mscclpp AllGather test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl

      - name: Run mscclpp SendRecv test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl

      - name: Run mscclpp AllReduce test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl

      - name: Run mscclpp AllToAll test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl

      # Compare the collected JSONL results against the stored ndmv4 baseline.
      - name: Check collective primitives performance
        run: |
          set -e
          python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- uses: actions/checkout@v4

View File

@@ -1,52 +0,0 @@
name: UnitTest
on: workflow_dispatch
jobs:
UnitTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UnitTests
run: |
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
.vscode/
build/
build_coverage/
__pycache__
.*.swp
*.so

View File

@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
@@ -99,6 +100,62 @@ else()
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
endif()
endif()
# Code coverage setup: instruments C++ (not CUDA) targets with gcov and
# provides `coverage` / `coverage-clean` convenience targets built on lcov.
if(MSCCLPP_ENABLE_COVERAGE)
  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
    message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
  endif()
  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
    message(STATUS "Code coverage enabled")
    # Add coverage flags to C++ targets only (not CUDA)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
    add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
    # Find lcov
    find_program(LCOV_PATH lcov)
    if(NOT LCOV_PATH)
      message(WARNING "lcov not found. Install lcov to generate coverage reports.")
    endif()
    if(LCOV_PATH)
      # Add coverage target: zero counters, run ctest, then capture and
      # filter the results into ${CMAKE_BINARY_DIR}/coverage.info.
      add_custom_target(coverage
        COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
        COMMAND ${LCOV_PATH} --directory . --zerocounters
        COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
        COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
        COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
        COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
        COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
        # Drop system headers, test code, and generated build files.
        COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
        COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Generating code coverage report"
      )
      # Add coverage clean target: delete the report and reset gcov counters.
      add_custom_target(coverage-clean
        COMMAND ${CMAKE_COMMAND} -E remove coverage.info
        COMMAND ${LCOV_PATH} --directory . --zerocounters
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Cleaning coverage data"
      )
    endif()
  else()
    message(WARNING "Code coverage is only supported with GCC or Clang compilers")
  endif()
endif()
if(MSCCLPP_GPU_ARCHS)
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")

View File

@@ -3,15 +3,16 @@
[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
| Testing Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install lcov 2.2
# Built from the upstream release tarball (the distro package is older).
# Installs the Perl modules lcov 2.x requires via cpanm, builds/installs
# lcov, then removes the build tree and apt caches to keep the layer small.
RUN LCOV_VERSION="2.2" && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        cpanminus \
        gcc \
        make \
        perl \
        && \
    cpanm --notest \
        Capture::Tiny \
        DateTime \
        JSON::XS \
        Memory::Process \
        TimeDate \
        && \
    cd /tmp && \
    curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
    tar xzf lcov.tar.gz && \
    cd lcov-${LCOV_VERSION} && \
    make install && \
    cd / && rm -rf /tmp/lcov* && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
@@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
pip install --no-cache-dir coverage xlsxwriter
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -4,27 +4,27 @@ set -e
declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["cuda11.8"]="/usr/local/cuda-11.8/compat"
["cuda12.4"]="/usr/local/cuda-12.4/compat"
["cuda12.8"]="/usr/local/cuda-12.8/compat"
["cuda12.9"]="/usr/local/cuda-12.9/compat"
["cuda13.0"]="/usr/local/cuda-13.0/compat"
["rocm6.2"]="/opt/rocm/lib"
)
declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
@@ -36,7 +36,7 @@ TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then

View File

@@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
```bash
# For NVIDIA platforms
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
# For AMD platforms
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
```
@@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2: ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

View File

@@ -1,13 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
find_package(MPI)
find_package(MPI REQUIRED)
set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
if(MSCCLPP_USE_IB)
list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
@@ -17,6 +16,7 @@ if(MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
add_compile_options(--offload-arch=${arch})
endforeach()
add_compile_definitions(__HIP_PLATFORM_AMD__)
endif()
function(add_test_executable name sources)
@@ -38,28 +38,25 @@ add_test_executable(executor_test executor_test.cc)
configure_file(run_mpi_test.sh.in run_mpi_test.sh)
include(CTest)
include(FetchContent)
FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip)
option(INSTALL_GTEST OFF)
FetchContent_MakeAvailable(googletest)
include(GoogleTest)
# Build test framework library
add_library(test_framework STATIC framework.cc)
target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON})
target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
# Unit tests
add_executable(unit_tests)
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST})
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} test_framework)
target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(unit)
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)
add_test(NAME unit_tests COMMAND unit_tests)
# Multi-process unit tests
add_executable(mp_unit_tests)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} test_framework MPI::MPI_CXX)
target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(mp_unit)
gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST)
add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh mp_unit_tests 2)
# mscclpp-test
add_subdirectory(mscclpp-test)
# Performance tests
add_subdirectory(perf)

View File

@@ -1,4 +1,4 @@
set -e
set -ex
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"

107
test/deploy/run-remote.sh Executable file
View File

@@ -0,0 +1,107 @@
#!/bin/bash
# Run a command on remote CI VMs via parallel-ssh.
# By default, runs inside the mscclpp-test docker container.
#
# Usage:
#   run-remote.sh [OPTIONS] < <command_script>
#
# Options:
#   --no-docker    Run command directly on the host, not inside docker
#   --no-log       Don't tail the log file in the background
#   --hostfile     Override hostfile path (default: test/deploy/hostfile_ci)
#   --host         Run command on a single host (uses parallel-ssh -H)
#   --user         SSH user when using --host or custom hostfile
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
SSH_OPTION="StrictHostKeyChecking=no"
# SSH private key path, injected by the CI pipeline (secure-file download).
KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
USE_DOCKER=true
# NOTE(review): USE_LOG is parsed below but never referenced in this script —
# confirm whether the log-tailing behavior mentioned in the header was removed.
USE_LOG=true
TARGET_HOST=""
REMOTE_USER=""
# Print usage to stderr.
usage() {
  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
}
# Exit with an error if an option that requires a value was given without one.
require_value() {
  local opt="$1"
  local val="$2"
  if [ -z "$val" ]; then
    echo "Missing value for ${opt}" >&2
    exit 1
  fi
}
# Parse leading --options; anything else is rejected below.
while [[ "$1" == --* ]]; do
  case "$1" in
    --no-docker) USE_DOCKER=false; shift ;;
    --no-log) USE_LOG=false; shift ;;
    --hostfile)
      require_value "--hostfile" "${2-}"
      HOSTFILE="$2"
      shift 2
      ;;
    --host)
      require_value "--host" "${2-}"
      TARGET_HOST="$2"
      shift 2
      ;;
    --user)
      require_value "--user" "${2-}"
      REMOTE_USER="$2"
      shift 2
      ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done
# No positional args allowed, and stdin must be a pipe/file (the command script).
if [ $# -ne 0 ] || [ -t 0 ]; then
  usage
  exit 1
fi
CMD=$(cat)
if [ -z "$CMD" ]; then
  usage
  exit 1
fi
# Base64-encode the command so it survives the SSH (and optional docker-exec)
# quoting layers untouched; decoded with `base64 -d` on the remote side.
CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n')
# Target selection: single host (-H) or hostfile (-h).
PSSH_TARGET_ARGS=()
if [ -n "$TARGET_HOST" ]; then
  PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
else
  PSSH_TARGET_ARGS=(-h "$HOSTFILE")
fi
PSSH_USER_ARGS=()
if [ -n "$REMOTE_USER" ]; then
  PSSH_USER_ARGS=(-l "$REMOTE_USER")
fi
# Shared parallel-ssh arguments: no timeout (-t 0), targets, optional user,
# extra ssh option (-x) selecting the key file, and no host-key checking.
PSSH_COMMON=(
  -t 0
  "${PSSH_TARGET_ARGS[@]}"
  "${PSSH_USER_ARGS[@]}"
  -x "-i ${KeyFilePath}"
  -O "$SSH_OPTION"
)
if $USE_DOCKER; then
  # Build the in-container command. The \\\$ / \\\" escapes must survive two
  # rounds of interpretation: the local double-quoted string below, then the
  # remote shell that runs `sudo docker exec ... bash -c "..."`.
  INNER="set -euxo pipefail;"
  INNER+=" cd /root/mscclpp;"
  INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
  INNER+=" CMD_B64='${CMD_B64}';"
  INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
  parallel-ssh -i "${PSSH_COMMON[@]}" \
    "sudo docker exec mscclpp-test bash -c \"${INNER}\""
else
  # Decode and execute the command directly on the remote host.
  parallel-ssh -i "${PSSH_COMMON[@]}" \
    "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
fi

View File

@@ -1,6 +1,5 @@
set -e
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
export PATH=/usr/local/mpi/bin:$PATH
function run_mscclpp_test()
{

View File

@@ -93,11 +93,8 @@ double benchTime(int rank, std::shared_ptr<mscclpp::Bootstrap> bootstrap, std::s
int main(int argc, char* argv[]) {
if (argc != 5 && argc != 6) {
std::cerr << "Usage: " << argv[0] << " <buffer size>"
<< " <execution plan path>"
<< " <number of iterations>"
<< " <number of graph iterations>"
<< " (optional) <packet type>" << std::endl;
std::cerr << "Usage: " << argv[0] << " <buffer size> <execution plan path>"
<< " <number of iterations> <number of graph iterations> (optional) <packet type>" << std::endl;
return 1;
}
@@ -142,7 +139,8 @@ int main(int argc, char* argv[]) {
NpKit::Shutdown();
}
std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << deltaSec * 1.e6 << " us" << std::endl;
double latencyUs = deltaSec * 1.e6;
std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl;
MPI_Finalize();
return 0;
}

323
test/framework.cc Normal file
View File

@@ -0,0 +1,323 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "framework.hpp"
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
namespace mscclpp {
namespace test {
// Global state shared by the runner (runAllTests) and the assertion helpers
// (utils::reportFailure / reportSuccess). File-local; mutated only from this
// translation unit.
static int gMpiRank = 0;                        // rank of this process in MPI_COMM_WORLD
static int gMpiSize = 1;                        // size of MPI_COMM_WORLD
static bool gMpiInitialized = false;            // set once by utils::initializeMPI()
static bool gCurrentTestPassed = true;          // cleared by reportFailure() for the running test
static std::string gCurrentTestFailureMessage;  // accumulated "file:line: msg" failure text
static std::string gCurrentTestName;            // "Suite.Name" of the running test, "" if none
// Returns "Suite.Name" for the currently running test, or "" if none.
std::string currentTestName() { return gCurrentTestName; }
namespace utils {
// Initialize MPI (unless something else already did) and cache this process's
// rank and world size in the file-local globals. Idempotent: a second call
// returns immediately.
void initializeMPI(int argc, char* argv[]) {
  if (gMpiInitialized) return;
  int initialized = 0;
  MPI_Initialized(&initialized);
  if (!initialized) {
    MPI_Init(&argc, &argv);
  }
  MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank);
  MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize);
  gMpiInitialized = true;
}
// Finalize MPI if initializeMPI() ran. NOTE(review): this calls MPI_Finalize()
// even when MPI was initialized externally (gMpiInitialized is set in that
// case too) — confirm callers expect this ownership.
static void finalizeMPI() {
  if (!gMpiInitialized) return;
  MPI_Finalize();
  gMpiInitialized = false;
}
// True on rank 0 (the rank that prints runner output).
bool isMainRank() { return gMpiRank == 0; }
int getMPIRank() { return gMpiRank; }
int getMPISize() { return gMpiSize; }
void cleanupMPI() { finalizeMPI(); }
// Record an assertion failure for the currently running test: marks it failed,
// appends "file:line: message" to the accumulated failure text, and echoes the
// message to stderr immediately.
void reportFailure(const char* file, int line, const std::string& message) {
  gCurrentTestPassed = false;
  std::ostringstream oss;
  oss << file << ":" << line << ": " << message;
  if (!gCurrentTestFailureMessage.empty()) {
    gCurrentTestFailureMessage += "\n";
  }
  gCurrentTestFailureMessage += oss.str();
  std::cerr << oss.str() << std::endl;
}
// Reset the per-test pass/fail state back to "passed".
void reportSuccess() {
  gCurrentTestPassed = true;
  gCurrentTestFailureMessage.clear();
}
// Timer implementation.
// A simple stopwatch: start() records a timestamp, stop() records the end
// timestamp. While running, elapsed time is measured against the current
// clock; after stop(), against the recorded end point.
Timer::Timer() : isRunning_(false) {}
void Timer::start() {
  startTime_ = std::chrono::high_resolution_clock::now();
  isRunning_ = true;
}
void Timer::stop() {
  endTime_ = std::chrono::high_resolution_clock::now();
  isRunning_ = false;
}
double Timer::elapsedMicroseconds() const {
  // Pick the end point: "now" while running, the stopped timestamp otherwise.
  const auto endPoint = isRunning_ ? std::chrono::high_resolution_clock::now() : endTime_;
  // duration_cast truncates to whole microseconds before the double conversion.
  return static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(endPoint - startTime_).count());
}
double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; }
double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; }
// Throw std::runtime_error when a GPU runtime call failed. The message embeds
// the call site ("file:line") and the runtime's error string.
void cudaCheck(cudaError_t err, const char* file, int line) {
  if (err == cudaSuccess) return;
  std::ostringstream oss;
  oss << "CUDA error at " << file << ":" << line << " - " << cudaGetErrorString(err);
  throw std::runtime_error(oss.str());
}
}  // namespace utils
// TestRegistry implementation
// Meyers singleton: constructed on first use, shared by every TEST()/
// PERF_TEST() registration and by RUN_ALL_TESTS().
TestRegistry& TestRegistry::instance() {
  static TestRegistry registry;
  return registry;
}
// Append a test to the run list. Called during static initialization by the
// registration macros; isPerfTest marks tests skippable via
// --exclude-perf-tests.
void TestRegistry::registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
                                bool isPerfTest) {
  tests_.push_back({suiteName, testName, std::move(factory), isPerfTest});
}
// Register a global environment. The registry takes ownership: the pointer is
// deleted after its TearDown() at the end of runAllTests().
void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(env); }
// Returns true if the test should run given the filter string.
// Filter syntax:
//   ""          -> run all
//   "Pattern"   -> run only tests whose full name contains Pattern
//   "-Pattern"  -> run all tests EXCEPT those whose full name contains Pattern
//                  (multiple comma-separated patterns allowed after '-')
static bool matchesFilter(const std::string& fullName, const std::string& filter) {
  if (filter.empty()) return true;
  if (filter[0] != '-') {
    // Positive filter: include only tests whose name contains the pattern.
    return fullName.find(filter) != std::string::npos;
  }
  // Negative filter: exclude tests matching any comma-separated pattern.
  std::istringstream patternStream(filter.substr(1));
  std::string pattern;
  while (std::getline(patternStream, pattern, ',')) {
    if (!pattern.empty() && fullName.find(pattern) != std::string::npos) {
      return false;
    }
  }
  return true;
}
// Run every registered test (subject to --filter / --exclude-perf-tests),
// printing gtest-style progress on rank 0 only.
//
// Flow: initialize MPI if needed; parse arguments; SetUp() all registered
// Environments; for each matching test run SetUp -> TestBody -> TearDown
// (TearDown runs whenever SetUp succeeded, even if TestBody threw);
// synchronize the per-test verdict across ranks with MPI_Allreduce(MIN);
// print a summary; TearDown() and delete the Environments in reverse order.
//
// Returns 1 on every rank if any test failed, 0 otherwise.
int TestRegistry::runAllTests(int argc, char* argv[]) {
  // Initialize MPI if not already initialized
  if (!gMpiInitialized) {
    utils::initializeMPI(argc, argv);
  }
  // Parse command line arguments: --filter=<pat> / --filter <pat> and
  // --exclude-perf-tests (skips tests registered via PERF_TEST()).
  std::string filter;
  bool excludePerfTests = false;
  for (int i = 1; i < argc; ++i) {
    std::string arg = argv[i];
    if (arg.find("--filter=") == 0) {
      filter = arg.substr(9);  // Length of "--filter="
    } else if (arg == "--filter" && i + 1 < argc) {
      filter = argv[i + 1];
      ++i;
    } else if (arg == "--exclude-perf-tests") {
      excludePerfTests = true;
    }
  }
  // Set up global test environments; any failure aborts the whole run.
  for (auto* env : environments_) {
    try {
      env->SetUp();
    } catch (const std::exception& e) {
      if (gMpiRank == 0) {
        std::cerr << "Failed to set up test environment: " << e.what() << std::endl;
      }
      return 1;
    }
  }
  int passed = 0;
  int failed = 0;
  int skipped = 0;
  // Pre-count the tests to run so the header can report totals up front.
  int totalToRun = 0;
  int skippedByFilter = 0;
  for (const auto& entry : tests_) {
    std::string fullName = entry.suiteName + "." + entry.testName;
    if (excludePerfTests && entry.isPerfTest) {
      skippedByFilter++;
      continue;
    }
    if (!matchesFilter(fullName, filter)) {
      skippedByFilter++;
      continue;
    }
    totalToRun++;
  }
  if (gMpiRank == 0) {
    std::cout << "[==========] Running " << totalToRun << " tests";
    if (skippedByFilter > 0) {
      std::cout << " (" << skippedByFilter << " skipped by filter)";
    }
    std::cout << ".\n";
  }
  for (const auto& entry : tests_) {
    std::string fullName = entry.suiteName + "." + entry.testName;
    if (excludePerfTests && entry.isPerfTest) continue;
    if (!matchesFilter(fullName, filter)) continue;
    // Reset per-test state for the assertion helpers.
    gCurrentTestPassed = true;
    gCurrentTestFailureMessage.clear();
    gCurrentTestName = fullName;
    if (gMpiRank == 0) {
      std::cout << "[ RUN      ] " << fullName << std::endl;
    }
    TestCase* testCase = nullptr;
    bool testSkipped = false;
    bool setUpSucceeded = false;
    try {
      testCase = entry.factory();
      testCase->SetUp();
      setUpSucceeded = true;
      testCase->TestBody();
    } catch (const SkipException& e) {
      // SKIP_TEST() was invoked (typically from SetUp()). NOTE(review): skips
      // are assumed to be collective — a rank-dependent skip bypasses the
      // MPI_Allreduce below and would leave the other ranks blocked in it.
      gCurrentTestPassed = true;
      testSkipped = true;
      if (gMpiRank == 0) {
        std::cout << "[  SKIPPED ] " << fullName << ": " << e.what() << std::endl;
      }
    } catch (const std::exception& e) {
      gCurrentTestPassed = false;
      if (gCurrentTestFailureMessage.empty()) {
        gCurrentTestFailureMessage = e.what();
      }
    } catch (...) {
      gCurrentTestPassed = false;
      if (gCurrentTestFailureMessage.empty()) {
        gCurrentTestFailureMessage = "Unknown exception";
      }
    }
    // Always call TearDown() if SetUp() succeeded, even if TestBody() threw
    if (setUpSucceeded && testCase != nullptr) {
      try {
        testCase->TearDown();
      } catch (const std::exception& e) {
        // If test already failed, keep original failure message
        if (gCurrentTestPassed) {
          gCurrentTestPassed = false;
          gCurrentTestFailureMessage = std::string("TearDown() failed: ") + e.what();
        }
      } catch (...) {
        if (gCurrentTestPassed) {
          gCurrentTestPassed = false;
          gCurrentTestFailureMessage = "TearDown() failed with unknown exception";
        }
      }
    }
    delete testCase;
    gCurrentTestName.clear();
    if (testSkipped) {
      skipped++;
      continue;
    }
    // Synchronize test status across all MPI processes: the test passes only
    // if it passed on every rank (MIN over {0,1}).
    int localPassed = gCurrentTestPassed ? 1 : 0;
    int globalPassed = 1;
    if (gMpiInitialized) {
      MPI_Allreduce(&localPassed, &globalPassed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
    } else {
      globalPassed = localPassed;
    }
    // Count on every rank, not just rank 0. Previously the counters were only
    // updated inside the rank-0 printing branch, so non-root ranks always
    // computed failed == 0 and returned exit code 0 even when tests failed.
    if (globalPassed) {
      passed++;
    } else {
      failed++;
    }
    if (gMpiRank == 0) {
      if (globalPassed) {
        std::cout << "[       OK ] " << fullName << std::endl;
      } else {
        std::cout << "[  FAILED  ] " << fullName << std::endl;
      }
    }
  }
  if (gMpiRank == 0) {
    std::cout << "[==========] " << totalToRun << " tests ran.\n";
    if (passed > 0) {
      std::cout << "[  PASSED  ] " << passed << " tests.\n";
    }
    if (skipped > 0) {
      std::cout << "[  SKIPPED ] " << skipped << " tests.\n";
    }
    if (failed > 0) {
      std::cout << "[  FAILED  ] " << failed << " tests.\n";
    }
  }
  // Tear down global test environments (in reverse order); the registry owns
  // and deletes them here.
  for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) {
    try {
      (*it)->TearDown();
      delete *it;
    } catch (const std::exception& e) {
      if (gMpiRank == 0) {
        std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
      }
    }
  }
  environments_.clear();
  // The counters are Allreduce-synchronized, so this exit code now agrees
  // across all ranks.
  return failed > 0 ? 1 : 0;
}
} // namespace test
} // namespace mscclpp

405
test/framework.hpp Normal file
View File

@@ -0,0 +1,405 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_TEST_FRAMEWORK_HPP_
#define MSCCLPP_TEST_FRAMEWORK_HPP_
#include <mpi.h>
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <mscclpp/gpu.hpp>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
namespace mscclpp {
namespace test {
// Test case base class.
// Each TEST()/PERF_TEST() expands to a subclass overriding TestBody(). The
// runner calls SetUp(), TestBody(), then TearDown(); TearDown() runs even when
// TestBody() throws, as long as SetUp() succeeded.
class TestCase {
 public:
  virtual ~TestCase() = default;
  virtual void SetUp() {}
  virtual void TearDown() {}
  virtual void TestBody() = 0;
};
// Environment base class (for global test setup/teardown).
// Registered via TestRegistry::addEnvironment(); SetUp() runs once before any
// test and TearDown() once after all tests.
class Environment {
 public:
  virtual ~Environment() = default;
  virtual void SetUp() {}
  virtual void TearDown() {}
};
// Test registry and runner.
// Singleton holding the list of registered tests and global environments;
// RUN_ALL_TESTS() forwards to runAllTests().
class TestRegistry {
 public:
  // Builds a fresh TestCase instance per run; the runner deletes it.
  using TestFactory = std::function<TestCase*()>;
  static TestRegistry& instance();
  // Register a test; isPerfTest marks it skippable via --exclude-perf-tests.
  void registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
                    bool isPerfTest = false);
  // Register a global environment; the registry takes ownership.
  void addEnvironment(Environment* env);
  // Run all registered tests; returns 1 if any failed, 0 otherwise.
  int runAllTests(int argc, char* argv[]);
 private:
  TestRegistry() = default;
  // One registered test: "suiteName.testName" is the full display name.
  struct TestEntry {
    std::string suiteName;
    std::string testName;
    TestFactory factory;
    bool isPerfTest;
  };
  std::vector<TestEntry> tests_;
  std::vector<Environment*> environments_;  // owned; deleted after TearDown()
};
// Returns "Suite.Name" for the currently running test, or "" if none.
std::string currentTestName();
// Utility functions
namespace utils {
// MPI management
// Initialize MPI (no-op if already done) and cache rank/world size.
void initializeMPI(int argc, char* argv[]);
// Finalize MPI if initializeMPI() set it up.
void cleanupMPI();
// True on rank 0.
bool isMainRank();
int getMPIRank();
int getMPISize();
// Timing utilities
// Simple stopwatch; elapsed* may be queried while running (measured against
// the current clock) or after stop() (measured against the stop timestamp).
class Timer {
 public:
  Timer();
  void start();
  void stop();
  double elapsedMicroseconds() const;
  double elapsedMilliseconds() const;
  double elapsedSeconds() const;
 private:
  std::chrono::high_resolution_clock::time_point startTime_;
  std::chrono::high_resolution_clock::time_point endTime_;
  bool isRunning_;
};
// CUDA utilities
// Throws std::runtime_error with "file:line" context when err != cudaSuccess.
void cudaCheck(cudaError_t err, const char* file, int line);
// Wrap a GPU runtime call; throws on failure with the call-site location.
#define CUDA_CHECK(call) mscclpp::test::utils::cudaCheck(call, __FILE__, __LINE__)
// Test assertion helpers
// Mark the current test failed and record/echo "file:line: message".
void reportFailure(const char* file, int line, const std::string& message);
// Reset the current test's status to passed.
void reportSuccess();
}  // namespace utils
// Exception for test skips.
// Thrown by SKIP_TEST(); the runner catches it and reports the test as
// skipped rather than failed.
class SkipException : public std::runtime_error {
 public:
  explicit SkipException(const std::string& message) : std::runtime_error(message) {}
};
// Helper class for FAIL() macro — supports message streaming via operator<<
// The failure is reported, and a std::runtime_error thrown to abort the test,
// from the destructor — i.e. at the end of the full `FAIL() << ...` statement.
// NOTE: the destructor throws unconditionally; do not use FAIL() where another
// exception may already be propagating (that would call std::terminate).
class FailHelper {
 public:
  explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
  // Append a value to the failure message.
  template <typename T>
  FailHelper& operator<<(const T& value) {
    message_ << value;
    return *this;
  }
  // noexcept(false) is required: destructors are implicitly noexcept in C++11+.
  ~FailHelper() noexcept(false) {
    std::string msg = message_.str();
    if (!msg.empty()) {
      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg);
    } else {
      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed");
    }
    throw std::runtime_error("Test failed");
  }
 private:
  const char* file_;    // call site passed by FAIL()
  int line_;
  std::ostringstream message_;
};
// Helper class for SKIP_TEST() macro — supports message streaming via operator<<
// Usage: SKIP_TEST() << "Reason for skipping";
// The SkipException is thrown from the destructor — i.e. at the end of the
// full `SKIP_TEST() << ...` statement; the runner reports the test as skipped.
// NOTE(review): file_/line_ are stored but never used — the skip message does
// not currently include the call site. Confirm whether that is intentional.
class SkipHelper {
 public:
  explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
  // Append a value to the skip reason.
  template <typename T>
  SkipHelper& operator<<(const T& value) {
    message_ << value;
    return *this;
  }
  // noexcept(false) is required: destructors are implicitly noexcept in C++11+.
  ~SkipHelper() noexcept(false) {
    std::string msg = message_.str();
    if (!msg.empty()) {
      throw SkipException("Test skipped: " + msg);
    } else {
      throw SkipException("Test skipped");
    }
  }
 private:
  const char* file_;
  int line_;
  std::ostringstream message_;
};
// SFINAE helper: resolves to T if T is a complete type (user-defined fixture),
// otherwise falls back to TestCase. This lets TEST() work with or without a fixture class.
namespace detail {
// C++11-compatible equivalent of std::void_t.
template <typename...>
using void_t = void;
// Primary template: chosen when sizeof(T) is ill-formed, i.e. T is incomplete
// (the forward declaration emitted by TEST() is its only declaration).
template <typename T, typename = void_t<>>
struct FixtureOf {
  using type = TestCase;
};
// Specialization: sizeof(T) is well-formed only for complete types, so this
// is selected when the suite name refers to a previously defined fixture.
template <typename T>
struct FixtureOf<T, void_t<decltype(sizeof(T))>> {
  using type = T;
};
}  // namespace detail
}  // namespace test
}  // namespace mscclpp
// --- Test registration macros ---
// TEST(Suite, Name): if Suite is a previously-defined class, the test inherits from it (fixture).
// Otherwise, the test inherits from TestCase (no fixture needed).
//
// Mechanics: `class test_fixture;` (re)declares the suite name, so FixtureOf
// sees either a complete fixture class (defined earlier) or an incomplete
// type (falls back to TestCase). A namespace-scope bool initializer registers
// the test with the singleton registry before main() runs.
#define TEST(test_fixture, test_name)                                                                     \
  class test_fixture;                                                                                     \
  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
   public:                                                                                                \
    void TestBody() override;                                                                             \
  };                                                                                                      \
  static bool test_fixture##_##test_name##_registered = []() {                                            \
    ::mscclpp::test::TestRegistry::instance().registerTest(                                               \
        #test_fixture, #test_name,                                                                        \
        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); });          \
    return true;                                                                                          \
  }();                                                                                                    \
  void test_fixture##_##test_name##_Test::TestBody()
// Same as TEST(), but registers the test with isPerfTest = true so it can be
// excluded with --exclude-perf-tests.
#define PERF_TEST(test_fixture, test_name)                                                                \
  class test_fixture;                                                                                     \
  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
   public:                                                                                                \
    void TestBody() override;                                                                             \
  };                                                                                                      \
  static bool test_fixture##_##test_name##_registered = []() {                                            \
    ::mscclpp::test::TestRegistry::instance().registerTest(                                               \
        #test_fixture, #test_name,                                                                        \
        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, true);    \
    return true;                                                                                          \
  }();                                                                                                    \
  void test_fixture##_##test_name##_Test::TestBody()
// --- Test runner macro ---
// Expects `argc` and `argv` to be in scope at the call site (typically main()).
#define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
// Assertion macros
//
// EXPECT_* report a failure (via utils::reportFailure) and let the test
// continue; ASSERT_* additionally throw std::runtime_error to abort the test
// body (the runner catches it and, if SetUp() succeeded, still runs
// TearDown()).
//
// All twelve binary comparisons share one implementation macro instead of
// twelve near-identical copies. `on_failure_` is the statement executed after
// the failure is reported: `(void)0` for EXPECT_*, a throw for ASSERT_*.
// Both operands are evaluated exactly once.
// NOTE: because the user expression passes through a forwarding macro, a
// macro-valued argument is expanded before stringification in the failure
// message; the checked behavior itself is unchanged.
#define MSCCLPP_TEST_CMP_IMPL_(val1, val2, op_, on_failure_)                                \
  do {                                                                                      \
    auto v1 = (val1);                                                                       \
    auto v2 = (val2);                                                                       \
    if (!(v1 op_ v2)) {                                                                     \
      std::ostringstream oss;                                                               \
      oss << "Expected: " #val1 " " #op_ " " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());                 \
      on_failure_;                                                                          \
    }                                                                                       \
  } while (0)
// Boolean checks: fail when `condition` is false (TRUE_) or true (FALSE_).
#define MSCCLPP_TEST_TRUE_IMPL_(condition, on_failure_)                                                 \
  do {                                                                                                  \
    if (!(condition)) {                                                                                 \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \
      on_failure_;                                                                                      \
    }                                                                                                   \
  } while (0)
#define MSCCLPP_TEST_FALSE_IMPL_(condition, on_failure_)                                                 \
  do {                                                                                                   \
    if (condition) {                                                                                     \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \
      on_failure_;                                                                                       \
    }                                                                                                    \
  } while (0)
// Abort statement shared by all fatal (ASSERT_*) variants.
#define MSCCLPP_TEST_ABORT_ throw std::runtime_error("Test assertion failed")
// Non-fatal assertions: record the failure and keep running the test.
#define EXPECT_TRUE(condition) MSCCLPP_TEST_TRUE_IMPL_(condition, (void)0)
#define EXPECT_FALSE(condition) MSCCLPP_TEST_FALSE_IMPL_(condition, (void)0)
#define EXPECT_EQ(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, ==, (void)0)
#define EXPECT_NE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, !=, (void)0)
#define EXPECT_LT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <, (void)0)
#define EXPECT_LE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <=, (void)0)
#define EXPECT_GT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >, (void)0)
#define EXPECT_GE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >=, (void)0)
// Fatal assertions: record the failure, then abort the test body.
#define ASSERT_TRUE(condition) MSCCLPP_TEST_TRUE_IMPL_(condition, MSCCLPP_TEST_ABORT_)
#define ASSERT_FALSE(condition) MSCCLPP_TEST_FALSE_IMPL_(condition, MSCCLPP_TEST_ABORT_)
#define ASSERT_EQ(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, ==, MSCCLPP_TEST_ABORT_)
#define ASSERT_NE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, !=, MSCCLPP_TEST_ABORT_)
#define ASSERT_LT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <, MSCCLPP_TEST_ABORT_)
#define ASSERT_LE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <=, MSCCLPP_TEST_ABORT_)
#define ASSERT_GT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >, MSCCLPP_TEST_ABORT_)
#define ASSERT_GE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >=, MSCCLPP_TEST_ABORT_)
// ASSERT_NO_THROW(statement): run `statement`; if it throws anything, report
// the failure (including e.what() for std::exception) and abort the test by
// rethrowing a plain std::runtime_error.
#define ASSERT_NO_THROW(statement)                                                                           \
  do {                                                                                                       \
    try {                                                                                                    \
      statement;                                                                                             \
    } catch (const std::exception& e) {                                                                      \
      std::ostringstream oss;                                                                                \
      oss << "Expected: " #statement " not to throw\n  Actual: threw " << e.what();                          \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());                                  \
      throw std::runtime_error("Test assertion failed");                                                     \
    } catch (...) {                                                                                          \
      ::mscclpp::test::utils::reportFailure(                                                                 \
          __FILE__, __LINE__, "Expected: " #statement " not to throw\n  Actual: threw unknown exception");   \
      throw std::runtime_error("Test assertion failed");                                                     \
    }                                                                                                        \
  } while (0)
// --- Test control macros ---
// Fail the current test immediately. Usage: FAIL() << "reason";
// (The temporary FailHelper reports the failure and throws from its destructor.)
#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
// Skip the current test. Usage: SKIP_TEST() << "reason";
// (The temporary SkipHelper throws SkipException from its destructor.)
#define SKIP_TEST() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
#endif  // MSCCLPP_TEST_FRAMEWORK_HPP_

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -48,7 +48,7 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr<mscclpp::Bootstrap> bootstr
bootstrapTestSendRecv(bootstrap);
}
TEST_F(BootstrapTest, WithId) {
TEST(BootstrapTest, WithId) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
mscclpp::UniqueId id;
if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId();
@@ -57,13 +57,13 @@ TEST_F(BootstrapTest, WithId) {
bootstrapTestAll(bootstrap);
}
TEST_F(BootstrapTest, WithIpPortPair) {
TEST(BootstrapTest, WithIpPortPair) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->initialize(gEnv->args["ip_port"]);
bootstrapTestAll(bootstrap);
}
TEST_F(BootstrapTest, ResumeWithId) {
TEST(BootstrapTest, ResumeWithId) {
// This test may take a few minutes.
bootstrapTestTimer.set(300);
@@ -76,19 +76,19 @@ TEST_F(BootstrapTest, ResumeWithId) {
}
}
TEST_F(BootstrapTest, ResumeWithIpPortPair) {
TEST(BootstrapTest, ResumeWithIpPortPair) {
for (int i = 0; i < 5; ++i) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->initialize(gEnv->args["ip_port"]);
}
}
TEST_F(BootstrapTest, ExitBeforeConnect) {
TEST(BootstrapTest, ExitBeforeConnect) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->createUniqueId();
}
TEST_F(BootstrapTest, TimeoutWithId) {
TEST(BootstrapTest, TimeoutWithId) {
mscclpp::Timer timer;
// All ranks initialize a bootstrap with their own id (will hang)
@@ -99,7 +99,7 @@ TEST_F(BootstrapTest, TimeoutWithId) {
// Set bootstrap timeout to 1 second
bootstrap->initialize(id, 1);
} catch (const mscclpp::Error& e) {
ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout);
ASSERT_TRUE(e.getErrorCode() == mscclpp::ErrorCode::Timeout);
}
// Timeout should be slightly greater than 1 second
@@ -139,7 +139,7 @@ class MPIBootstrap : public mscclpp::Bootstrap {
}
};
TEST_F(BootstrapTest, MPIBootstrap) {
TEST(BootstrapTest, MPIBootstrap) {
auto bootstrap = std::make_shared<MPIBootstrap>();
bootstrapTestAll(bootstrap);
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -185,7 +185,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
return true;
}
TEST_F(CommunicatorTest, BasicWrite) {
TEST(CommunicatorTest, BasicWrite) {
if (gEnv->rank >= numRanksToUse) return;
deviceBufferInit();
@@ -215,7 +215,7 @@ __global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle
}
}
TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
TEST(CommunicatorTest, WriteWithDeviceSemaphores) {
if (gEnv->rank >= numRanksToUse) return;
std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>> semaphores;
@@ -254,7 +254,7 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
communicator->bootstrap()->barrier();
}
TEST_F(CommunicatorTest, WriteWithHostSemaphores) {
TEST(CommunicatorTest, WriteWithHostSemaphores) {
if (gEnv->rank >= numRanksToUse) return;
std::unordered_map<int, std::shared_ptr<mscclpp::Host2HostSemaphore>> semaphores;

View File

@@ -1,7 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
#include <unistd.h>
#include <filesystem>
#include <mscclpp/env.hpp>
@@ -22,7 +23,7 @@ std::string getExecutablePath() {
void ExecutorTest::SetUp() {
if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
SKIP_TEST() << "This test requires world size to be 2 and ranks per node to be 2";
}
MultiProcessTest::SetUp();
@@ -49,7 +50,7 @@ void ExecutorTest::TearDown() {
MultiProcessTest::TearDown();
}
TEST_F(ExecutorTest, TwoNodesAllreduce) {
TEST(ExecutorTest, TwoNodesAllreduce) {
std::string executablePath = getExecutablePath();
std::filesystem::path path = executablePath;
std::filesystem::path executionFilesPath =

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -18,9 +18,7 @@ void IbTestBase::SetUp() {
}
void IbPeerToPeerTest::SetUp() {
#if !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
REQUIRE_IBVERBS;
IbTestBase::SetUp();
@@ -80,7 +78,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6
qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
}
TEST_F(IbPeerToPeerTest, SimpleSendRecv) {
TEST(IbPeerToPeerTest, SimpleSendRecv) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;
@@ -195,7 +193,7 @@ __global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIt
}
}
TEST_F(IbPeerToPeerTest, MemoryConsistency) {
TEST(IbPeerToPeerTest, MemoryConsistency) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;
@@ -303,7 +301,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
EXPECT_EQ(res, 0);
}
TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
@@ -8,7 +8,7 @@
void MemoryChannelOneToOneTest::SetUp() {
// Need at least two ranks within a node
if (gEnv->nRanksPerNode < 2) {
GTEST_SKIP();
SKIP_TEST();
}
// Use only two ranks
setNumRanksToUse(2);
@@ -88,27 +88,12 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName,
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
// The least nelem is 2 for packet ping pong
kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, nElem, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
int nTries = 1000000;
communicator->bootstrap()->barrier();
@@ -169,7 +154,7 @@ __global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) {
}
}
TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
TEST(MemoryChannelOneToOneTest, PutPingPong) {
if (gEnv->rank >= numRanksToUse) return;
const int nElem = 4 * 1024 * 1024;
@@ -187,28 +172,12 @@ TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
}
__global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
@@ -248,7 +217,7 @@ __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
}
}
TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
TEST(MemoryChannelOneToOneTest, GetPingPong) {
if (gEnv->rank >= numRanksToUse) return;
const int nElem = 4 * 1024 * 1024;
@@ -266,28 +235,12 @@ TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
}
__global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) {
@@ -371,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int*
}
}
TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) {
TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
};
packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper);
}
TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) {
TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
};

View File

@@ -98,14 +98,18 @@ static std::unordered_map<std::string, std::string> parseArgs(int argc, const ch
continue;
}
// Unrecognized positional token: ignore to keep parser permissive for gtest/MPI extras
// Unrecognized positional token: ignore
}
return options;
}
void MultiProcessTestEnv::SetUp() {
MPI_Init(NULL, NULL);
int initialized = 0;
MPI_Initialized(&initialized);
if (!initialized) {
MPI_Init(NULL, NULL);
}
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
// get the local number of nodes with MPI
@@ -128,18 +132,17 @@ void MultiProcessTest::TearDown() {
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
gEnv = new MultiProcessTestEnv(argc, (const char**)argv);
::testing::AddGlobalTestEnvironment(gEnv);
::mscclpp::test::TestRegistry::instance().addEnvironment(gEnv);
return RUN_ALL_TESTS();
}
TEST_F(MultiProcessTest, Prelim) {
TEST(MultiProcessTest, Prelim) {
// Test to make sure the MPI environment is set up correctly
ASSERT_GE(gEnv->worldSize, 2);
}
TEST_F(MultiProcessTest, HostName) {
TEST(MultiProcessTest, HostName) {
const size_t maxNameLen = 1024;
std::vector<char> buffer(gEnv->worldSize * maxNameLen, '\0');
std::string hostName = mscclpp::getHostName(maxNameLen, '\0');
@@ -159,7 +162,7 @@ TEST_F(MultiProcessTest, HostName) {
}
}
TEST_F(MultiProcessTest, HostHash) {
TEST(MultiProcessTest, HostHash) {
std::vector<uint64_t> buffer(gEnv->worldSize, 0);
uint64_t hostHash = mscclpp::getHostHash();
buffer[gEnv->rank] = hostHash;

View File

@@ -4,8 +4,6 @@
#ifndef MSCCLPP_MP_UNIT_TESTS_HPP_
#define MSCCLPP_MP_UNIT_TESTS_HPP_
#include <gtest/gtest.h>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/memory_channel.hpp>
@@ -13,10 +11,18 @@
#include <mscclpp/port_channel.hpp>
#include <mscclpp/utils.hpp>
#include "../framework.hpp"
#include "ib.hpp"
#include "utils_internal.hpp"
class MultiProcessTestEnv : public ::testing::Environment {
// Skip the current test if IBVerbs is not available in this build
#if defined(USE_IBVERBS)
#define REQUIRE_IBVERBS
#else
#define REQUIRE_IBVERBS SKIP_TEST() << "This test requires IBVerbs that the current build does not support."
#endif
class MultiProcessTestEnv : public ::mscclpp::test::Environment {
public:
MultiProcessTestEnv(int argc, const char** argv);
@@ -37,7 +43,7 @@ mscclpp::Transport ibIdToTransport(int id);
int rankToLocalRank(int rank);
int rankToNode(int rank);
class MultiProcessTest : public ::testing::Test {
class MultiProcessTest : public ::mscclpp::test::TestCase {
protected:
void TearDown() override;
};

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <cstdint>
#include <mscclpp/concurrency_device.hpp>
@@ -178,26 +178,12 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
const int nTries = 1000;
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
proxyService->stopProxy();
}
@@ -223,8 +209,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
const std::string testName = ::mscclpp::test::currentTestName();
const int nTries = 1000;
// Warm-up
@@ -247,63 +232,51 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
proxyService->stopProxy();
}
TEST_F(PortChannelOneToOneTest, PingPong) {
TEST(PortChannelOneToOneTest, PingPong) {
testPingPong(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostMode) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongEthernet) {
TEST(PortChannelOneToOneTest, PingPongEthernet) {
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongWithPoll) {
TEST(PortChannelOneToOneTest, PingPongWithPoll) {
testPingPong(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerf) {
TEST(PortChannelOneToOneTest, PingPongPerf) {
testPingPongPerf(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
REQUIRE_IBVERBS;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) {
TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
}
@@ -407,34 +380,14 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
const int nTries = 1000;
// The least nelem is 2 for packet ping pong
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 2, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
1024 * 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
4 * 1024 * 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, nElem, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
communicator->bootstrap()->barrier();
@@ -471,8 +424,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
proxyService->startProxy();
auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
const std::string testName = ::mscclpp::test::currentTestName();
const int nTries = 1000000;
// Warm-up
@@ -497,47 +449,32 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
proxyService->stopProxy();
}
TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
REQUIRE_IBVERBS;
testPacketPingPong(true, IbMode::Host);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
REQUIRE_IBVERBS;
testPacketPingPongPerf(true, IbMode::Host);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPacketPingPongPerf(true, IbMode::HostNoAtomic);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPacketPingPong(true, IbMode::HostNoAtomic);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
#include <mscclpp/switch_channel.hpp>
@@ -10,10 +10,10 @@
void SwitchChannelTest::SetUp() {
// Need at least two ranks within a node
if (gEnv->nRanksPerNode < 2) {
GTEST_SKIP();
SKIP_TEST();
}
if (!mscclpp::isNvlsSupported()) {
GTEST_SKIP();
SKIP_TEST();
}
// Use only two ranks
setNumRanksToUse(2);
@@ -23,6 +23,8 @@ void SwitchChannelTest::SetUp() {
void SwitchChannelTest::TearDown() { CommunicatorTestBase::TearDown(); }
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
__global__ void kernelSwitchReduce() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
@@ -31,7 +33,16 @@ __global__ void kernelSwitchReduce() {
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST_F(SwitchChannelTest, SimpleAllReduce) {
__global__ void kernelSwitchReduceTwo() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
gConstSwitchChan1.broadcast(0, val1);
auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
gConstSwitchChan2.broadcast(0, val2);
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST(SwitchChannelTest, SimpleAllReduce) {
if (gEnv->rank >= numRanksToUse) return;
std::vector<int> ranks;
@@ -66,22 +77,13 @@ TEST_F(SwitchChannelTest, SimpleAllReduce) {
for (int i = 0; i < numRanksToUse; i++) {
expected += i + 1.0f;
}
ASSERT_EQ(result, expected) << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank;
if (result != expected) {
std::cerr << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank << std::endl;
}
ASSERT_EQ(result, expected);
}
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
__global__ void kernelSwitchReduceTwo() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
gConstSwitchChan1.broadcast(0, val1);
auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
gConstSwitchChan2.broadcast(0, val2);
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
TEST(SwitchChannelTest, TwoChannelsSameConnection) {
if (gEnv->rank >= numRanksToUse) return;
std::vector<int> ranks;
@@ -97,12 +99,9 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice));
MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice));
// Connection size must be large enough for two granularity-aligned buffers.
// The multicast granularity is typically 2MB, so we need at least 2 * 2MB.
const size_t connSize = buffer1.bytes() + buffer2.bytes();
auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize);
// Bind two separate buffers to the same connection
auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize);
auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize);
@@ -132,6 +131,6 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
expected1 += (i + 1.0f) * 1.0f;
expected2 += (i + 1.0f) * 10.0f;
}
ASSERT_EQ(result1, expected1) << "Channel1: expected " << expected1 << " but got " << result1;
ASSERT_EQ(result2, expected2) << "Channel2: expected " << expected2 << " but got " << result2;
ASSERT_EQ(result1, expected1);
ASSERT_EQ(result2, expected2);
}

View File

@@ -1,44 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Find required packages
find_package(MPI REQUIRED)
# Note: nlohmann_json::nlohmann_json target is already available from the main project
# Set up common libraries and includes for tests
set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
if(MSCCLPP_USE_IB)
list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(PERF_TEST_INC_COMMON
PRIVATE ${PROJECT_SOURCE_DIR}/include
SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
# Function to add a test executable
function(add_perf_test_executable name sources)
if(MSCCLPP_USE_ROCM)
set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
endif()
add_executable(${name} ${sources})
target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON})
# Link nlohmann_json - use the target from main project
target_link_libraries(${name} nlohmann_json::nlohmann_json)
if(MSCCLPP_USE_IB)
target_compile_definitions(${name} PRIVATE USE_IBVERBS)
endif()
target_include_directories(${name} ${PERF_TEST_INC_COMMON})
target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS)
# Set C++ standard
target_compile_features(${name} PRIVATE cxx_std_17)
set_target_properties(${name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/perf")
endfunction()
# Add FIFO test
add_perf_test_executable(fifo_test "framework.cc;fifo_test.cu")

View File

@@ -1,298 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <getopt.h>
#include <iostream>
#include <map>
#include <memory>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <sstream>
#include <stdexcept>
#include "framework.hpp"
using namespace mscclpp::test;
// Constants for timeout and trigger calculation
constexpr uint64_t TIMEOUT_SPINS = 1000000;
constexpr int MIN_TRIGGERS = 1000;
constexpr int MIN_WARMUP_TRIGGERS = 100;
constexpr int TRIGGERS_PER_FIFO_SIZE = 10;
constexpr int WARMUP_TRIGGERS_PER_FIFO_SIZE = 2;
__constant__ mscclpp::FifoDeviceHandle gFifoDeviceHandle;
__global__ void kernelFifoPush(size_t numTriggers) {
mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
mscclpp::ProxyTrigger trigger;
for (size_t i = 1; i <= numTriggers; ++i) {
trigger.fst = i;
trigger.snd = tid ^ i;
fifo.push(trigger);
}
}
__global__ void kernelFifoPushSync(size_t numTriggers) {
mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
mscclpp::ProxyTrigger trigger;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t i = 1; i <= numTriggers; ++i) {
trigger.fst = i;
trigger.snd = tid ^ i;
fifo.sync(fifo.push(trigger));
}
}
static void setupCuda(int& cudaDevice, int& numaNode) {
utils::CUDA_CHECK(cudaGetDevice(&cudaDevice));
numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
mscclpp::numaBind(numaNode);
}
// Helper function to consume triggers from FIFO
static bool consumeTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
int totalTriggers = numTriggers * parallel;
std::unordered_map<int, int> triggerCounts;
for (int i = 0; i < totalTriggers; ++i) {
mscclpp::ProxyTrigger trigger;
uint64_t spin = 0;
do {
trigger = hostFifo->poll();
if (spin++ > TIMEOUT_SPINS) {
return false;
}
} while (trigger.fst == 0 || trigger.snd == 0);
// Process trigger (see src/proxy.cc)
trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
trigger.snd = trigger.snd ^ trigger.fst;
assert(triggerCounts[trigger.snd] + 1 == trigger.fst);
triggerCounts[trigger.snd]++;
hostFifo->pop();
}
return true;
}
// Helper function to run a single kernel variant and return performance metrics
std::tuple<double, double, int, int> runSingleKernelVariant(void (*kernel)(size_t),
std::unique_ptr<mscclpp::Fifo>& hostFifo,
cudaStream_t stream, int numParallel) {
// Calculate triggers based on FIFO size
const int numTriggers = std::max(MIN_TRIGGERS, static_cast<int>(hostFifo->size() * TRIGGERS_PER_FIFO_SIZE));
const int warmupTriggers =
std::max(MIN_WARMUP_TRIGGERS, static_cast<int>(hostFifo->size() * WARMUP_TRIGGERS_PER_FIFO_SIZE));
// Warmup
kernel<<<numParallel, 1, 0, stream>>>(warmupTriggers);
utils::CUDA_CHECK(cudaGetLastError());
// Process warmup triggers (note: total triggers = warmupTriggers * numParallel)
if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) {
return {0.0, 0.0, 0, 0}; // Return error values
}
utils::CUDA_CHECK(cudaStreamSynchronize(stream));
// Benchmark
utils::Timer timer;
timer.start();
kernel<<<numParallel, 1, 0, stream>>>(numTriggers);
utils::CUDA_CHECK(cudaGetLastError());
// Process all triggers
if (!consumeTriggers(hostFifo, numTriggers, numParallel)) {
return {0.0, 0.0, 0, 0};
}
utils::CUDA_CHECK(cudaStreamSynchronize(stream));
timer.stop();
const int totalTriggers = numTriggers * numParallel;
double throughput = totalTriggers / timer.elapsedSeconds();
double duration_us = timer.elapsedMicroseconds();
utils::CUDA_CHECK(cudaDeviceSynchronize());
return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel};
}
void runFifoTestVariant(std::unique_ptr<mscclpp::Fifo>& hostFifo, cudaStream_t stream, int numParallel,
nlohmann::ordered_json& combinedMetrics) {
auto [pushThroughput, pushDuration, numTriggers, warmupTriggers] =
runSingleKernelVariant(kernelFifoPush, hostFifo, stream, numParallel);
auto [syncThroughput, syncDuration, syncNumTriggers, syncWarmupTriggers] =
runSingleKernelVariant(kernelFifoPushSync, hostFifo, stream, numParallel);
auto formatThroughput = [](double thru) {
return double(int(thru * 10)) / 10.0; // Round to 1 decimal place
};
std::string prefix = "p" + std::to_string(numParallel) + "_";
combinedMetrics[prefix + "push_throughput"] = formatThroughput(pushThroughput);
combinedMetrics[prefix + "push_sync_throughput"] = formatThroughput(syncThroughput);
combinedMetrics[prefix + "push_duration_us"] = pushDuration;
combinedMetrics[prefix + "push_sync_duration_us"] = syncDuration;
combinedMetrics[prefix + "num_triggers"] = numTriggers;
combinedMetrics[prefix + "warmup_triggers"] = warmupTriggers;
}
struct FifoTestConfig {
int fifoSize;
std::vector<int> parallelismLevels;
// Constructor with default parallelism levels
FifoTestConfig(int size, const std::vector<int>& parallel = {1, 2, 4, 8, 16})
: fifoSize(size), parallelismLevels(parallel) {}
};
void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[maybe_unused]] int worldSize,
[[maybe_unused]] int localRank) {
if (config.fifoSize <= 0) {
throw std::invalid_argument("FIFO size must be positive");
}
if (config.parallelismLevels.empty()) {
throw std::invalid_argument("At least one parallelism level must be specified");
}
int cudaDevice, numaNode;
setupCuda(cudaDevice, numaNode);
auto hostFifo = std::make_unique<mscclpp::Fifo>(config.fifoSize);
mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
utils::CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
cudaStream_t stream;
utils::CUDA_CHECK(cudaStreamCreate(&stream));
// Create test name with parallelism range
std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel";
// Add parallelism range to test name (e.g., "P1-16" or "P1-4-16-64")
if (!config.parallelismLevels.empty()) {
testName += std::to_string(config.parallelismLevels.front());
if (config.parallelismLevels.size() > 1) {
testName += "-" + std::to_string(config.parallelismLevels.back());
// If parallelism levels have non-standard steps, include more detail
if (config.parallelismLevels.size() > 2 &&
(config.parallelismLevels[1] != 2 * config.parallelismLevels[0] || config.parallelismLevels.size() > 3)) {
testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_ParallelCustom";
}
}
}
// Print test configuration
if (utils::isMainRank()) {
std::stringstream ss;
ss << "Running FIFO test with size=" << config.fifoSize << ", parallelism_levels=[";
for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
if (i > 0) ss << ",";
ss << config.parallelismLevels[i];
}
ss << "]";
std::cout << ss.str() << std::endl;
}
nlohmann::ordered_json combinedMetrics;
for (int numParallel : config.parallelismLevels) {
runFifoTestVariant(hostFifo, stream, numParallel, combinedMetrics);
}
std::map<std::string, std::string> testParams;
testParams["fifo_size"] = std::to_string(static_cast<int>(hostFifo->size()));
// Add parallelism levels to test parameters
std::stringstream parallelismStream;
for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
if (i > 0) parallelismStream << ",";
parallelismStream << config.parallelismLevels[i];
}
testParams["parallelism_levels"] = parallelismStream.str();
utils::recordResult(testName, "fifo", combinedMetrics, testParams);
utils::CUDA_CHECK(cudaStreamDestroy(stream));
}
void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) {
// clang-format off
std::vector<FifoTestConfig> configs = {
{1, {1}},
{128, {1, 8, 64, 128}},
{512, {1, 8, 64, 256, 512}},
};
// clang-format on
for (const auto& config : configs) {
runFifoTest(config, rank, worldSize, localRank);
}
}
static void printUsage(char* argv0) {
std::stringstream ss;
ss << "Usage: " << argv0 << " [OPTIONS]\n"
<< "\n"
<< "Options:\n"
<< " -o, --output-format FORMAT Output format: human or json (default: human)\n"
<< " -f, --output-file FILE JSON output file path (default: report.jsonl)\n"
<< " -v, --verbose Increase verbosity\n"
<< " -h, --help Show this help message\n";
std::cout << ss.str();
}
int main(int argc, char* argv[]) {
std::string outputFormat = "human";
std::string outputFile = "report.jsonl";
bool verbose = false;
static struct option longOptions[] = {{"output-format", required_argument, 0, 'o'},
{"output-file", required_argument, 0, 'f'},
{"verbose", no_argument, 0, 'v'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
int c;
while ((c = getopt_long(argc, argv, "o:f:vh", longOptions, nullptr)) != -1) {
switch (c) {
case 'o':
outputFormat = optarg;
break;
case 'f':
outputFile = optarg;
break;
case 'v':
verbose = true;
break;
case 'h':
printUsage(argv[0]);
return 0;
default:
printUsage(argv[0]);
return 1;
}
}
std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>> tests = {
{"AllFifoTests", "FIFO performance tests with multiple configurations", runAllFifoTests}};
int result = utils::runMultipleTests(argc, argv, tests);
if (utils::isMainRank()) {
if (outputFormat == "json") {
utils::writeResultsToFile(outputFile);
} else {
utils::printResults(verbose);
}
}
utils::cleanupMPI();
return result;
}

View File

@@ -1,208 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "framework.hpp"
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
namespace mscclpp {
namespace test {
// Global state for results
static std::vector<TestResult> g_results;
static int g_mpi_rank = 0;
static int g_mpi_size = 1;
static bool g_mpi_initialized = false;
namespace utils {
// Internal MPI helper functions (not exposed in header)
void initializeMPI(int argc, char* argv[]) {
if (g_mpi_initialized) return;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
g_mpi_initialized = true;
}
static void finalizeMPI() {
if (!g_mpi_initialized) return;
MPI_Finalize();
g_mpi_initialized = false;
}
static int getMPIRank() { return g_mpi_rank; }
static int getMPISize() { return g_mpi_size; }
static bool isMainProcess() { return g_mpi_rank == 0; }
// Public utility functions for test output
bool isMainRank() { return g_mpi_rank == 0; }
void cleanupMPI() { finalizeMPI(); }
std::string getCurrentTimestamp() {
auto now = std::chrono::system_clock::now();
auto time_t = std::chrono::system_clock::to_time_t(now);
std::stringstream ss;
ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S");
return ss.str();
}
// Append one result entry to the process-local result list, stamping it with
// the cached MPI topology and a UTC timestamp.
void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                  const std::map<std::string, std::string>& test_params) {
  TestResult entry;
  entry.test_name = test_name;
  entry.test_category = test_category;
  entry.timestamp = getCurrentTimestamp();
  entry.process_rank = g_mpi_rank;
  entry.num_processes = g_mpi_size;
  entry.metrics = metrics;
  entry.test_params = test_params;
  g_results.push_back(entry);
}
void writeResultsToFile(const std::string& filename) {
std::ofstream file(filename);
if (!file) {
throw std::runtime_error("Cannot open output file: " + filename);
}
for (const auto& result : g_results) {
nlohmann::ordered_json j;
j["test_name"] = result.test_name;
j["test_category"] = result.test_category;
j["test_config"] = result.test_params;
j["metrics"] = result.metrics;
j["num_processes"] = result.num_processes;
j["process_rank"] = result.process_rank;
j["timestamp"] = result.timestamp;
file << j.dump() << std::endl;
}
}
void printResults(bool verbose) {
if (!isMainProcess()) return;
std::cout << "\n=== Test Results ===" << std::endl;
for (const auto& result : g_results) {
std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl;
if (verbose && !result.test_params.empty()) {
std::cout << " Parameters:" << std::endl;
for (const auto& param : result.test_params) {
std::cout << " " << param.first << ": " << param.second << std::endl;
}
}
std::cout << " Metrics:" << std::endl;
for (auto it = result.metrics.begin(); it != result.metrics.end(); ++it) {
std::cout << " " << it.key() << ": " << it.value() << std::endl;
}
}
std::cout << std::endl;
}
// Timer implementation: a wall-clock stopwatch over
// std::chrono::high_resolution_clock (declared in framework.hpp).
Timer::Timer() : is_running_(false) {}
// Record the starting instant and mark the timer as running.
void Timer::start() {
  is_running_ = true;
  start_time_ = std::chrono::high_resolution_clock::now();
}
// Freeze the measured interval at the current instant.
void Timer::stop() {
  is_running_ = false;
  end_time_ = std::chrono::high_resolution_clock::now();
}
// Elapsed whole microseconds between start() and stop(); if the timer is
// still running, measure up to "now" instead.
double Timer::elapsedMicroseconds() const {
  const auto endPoint = is_running_ ? std::chrono::high_resolution_clock::now() : end_time_;
  return std::chrono::duration_cast<std::chrono::microseconds>(endPoint - start_time_).count();
}
double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1e3; }
double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1e6; }
// Throw std::runtime_error when a CUDA call failed, annotating the message
// with the source location and the driver's error string.
void cudaCheck(cudaError_t err, const char* file, int line) {
  if (err == cudaSuccess) return;
  std::stringstream ss;
  ss << "CUDA error at " << file << ":" << line << " - " << cudaGetErrorString(err);
  throw std::runtime_error(ss.str());
}
// Run each (name, description, function) entry in `tests` under a single MPI
// session. Returns 0 when every test ran cleanly, 1 if any test threw.
//
// Each test function receives (rank, size, local_rank). A barrier separates
// consecutive tests so all ranks advance together. MPI is initialized here
// but deliberately NOT finalized on the success path — callers own cleanup
// (via cleanupMPI) so they can still write/print accumulated results.
int runMultipleTests(
    int argc, char* argv[],
    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests) {
  int totalResult = 0;
  // Initialize MPI once for all tests
  initializeMPI(argc, argv);
  try {
    // Get MPI information
    int rank = getMPIRank();
    int size = getMPISize();
    int local_rank = rank;  // For simplicity, assume local_rank = rank
    for (const auto& test : tests) {
      const std::string& testName = std::get<0>(test);
      const std::string& testDescription = std::get<1>(test);
      const std::function<void(int, int, int)>& testFunction = std::get<2>(test);
      // Only rank 0 announces the test to avoid interleaved output.
      if (rank == 0) {
        std::cout << "Running test: " << testName << std::endl;
        if (!testDescription.empty()) {
          std::cout << " " << testDescription << std::endl;
        }
      }
      // Don't clear results - accumulate them for all tests in the same file
      // g_results.clear(); // Commented out to accumulate results
      try {
        // Run the individual test function with MPI information
        testFunction(rank, size, local_rank);
        // Synchronize before moving to next test
        MPI_Barrier(MPI_COMM_WORLD);
      } catch (const std::exception& e) {
        // A failing test marks the whole run as failed but does not stop
        // the remaining tests from executing.
        if (rank == 0) {
          std::cerr << "Error in test " << testName << ": " << e.what() << std::endl;
        }
        totalResult = 1;
      }
    }
    // Don't cleanup MPI here - let the caller handle it
    // finalizeMPI();
  } catch (const std::exception& e) {
    // Failure outside the per-test loop (e.g. setup): finalize MPI ourselves,
    // since the caller's result-reporting path will not run meaningfully.
    if (g_mpi_rank == 0) {
      std::cerr << "Error: " << e.what() << std::endl;
    }
    finalizeMPI();
    return 1;
  }
  return totalResult;
}
} // namespace utils
} // namespace test
} // namespace mscclpp

View File

@@ -1,80 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
#define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
#include <mpi.h>
#include <chrono>
#include <fstream>
#include <functional>
#include <map>
#include <mscclpp/gpu.hpp>
#include <nlohmann/json.hpp>
#include <string>
#include <tuple>
#include <vector>
namespace mscclpp {
namespace test {
// Test result structure: one recorded measurement set from a single test run,
// filled in by utils::recordResult().
struct TestResult {
  std::string test_name;      // name passed to recordResult()
  std::string test_category;  // grouping label for the test
  std::map<std::string, std::string> test_params;  // free-form configuration key/values
  nlohmann::ordered_json metrics;  // measured values; insertion order preserved
  int num_processes;  // MPI world size when the result was recorded
  int process_rank;   // MPI rank that produced this result
  std::string timestamp;  // UTC time, "YYYY-MM-DDTHH:MM:SS"
};
// Simple utility functions for testing
namespace utils {
// Test execution utilities
int runMultipleTests(
int argc, char* argv[],
const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests);
// MPI management
void initializeMPI(int argc, char* argv[]);
void cleanupMPI();
bool isMainRank();
// Result recording
void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
const std::map<std::string, std::string>& test_params = {});
// Output utilities
// Write all recorded results to `filename` as JSON Lines; print them to
// stdout (rank 0 only) with optional parameter detail.
// Note: the redundant re-declaration of cleanupMPI() that used to follow was
// removed; it is already declared in the MPI management section above.
void writeResultsToFile(const std::string& filename);
void printResults(bool verbose = false);
// Timing utilities
// Simple wall-clock stopwatch over std::chrono::high_resolution_clock.
class Timer {
 public:
  Timer();
  // Begin (or restart) timing from the current instant.
  void start();
  // Freeze the measured interval at the current instant.
  void stop();
  // Elapsed time between start() and stop(); if still running, up to "now".
  double elapsedMicroseconds() const;
  double elapsedMilliseconds() const;
  double elapsedSeconds() const;

 private:
  std::chrono::high_resolution_clock::time_point start_time_;  // set by start()
  std::chrono::high_resolution_clock::time_point end_time_;    // set by stop()
  bool is_running_;  // true between start() and stop()
};
// CUDA utilities
void cudaCheck(cudaError_t err, const char* file, int line);
#define CUDA_CHECK(call) cudaCheck(call, __FILE__, __LINE__)
} // namespace utils
} // namespace test
} // namespace mscclpp
#endif // MSCCLPP_TEST_PERF_FRAMEWORK_HPP_

View File

@@ -1,11 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
target_sources(unit_tests PRIVATE
unit_tests_main.cc
core_tests.cc
gpu_utils_tests.cc
errors_tests.cc
fifo_tests.cu
fifo_perf_tests.cu
numa_tests.cc
socket_tests.cc
utils_tests.cc

View File

@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <gtest/gtest.h>
#include "../framework.hpp"
#undef NDEBUG
#ifndef DEBUG_BUILD

View File

@@ -1,12 +1,14 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gmock/gmock.h>
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/core.hpp>
class LocalCommunicatorTest : public ::testing::Test {
#include "../framework.hpp"
// TODO: TransportFlags needs operator<< for EXPECT_EQ to work
// Using ASSERT_TRUE with manual comparisons as workaround
class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
protected:
void SetUp() override {
bootstrap = std::make_shared<mscclpp::TcpBootstrap>(0, 1);
@@ -18,15 +20,15 @@ class LocalCommunicatorTest : public ::testing::Test {
std::shared_ptr<mscclpp::Communicator> comm;
};
TEST_F(LocalCommunicatorTest, RegisterMemory) {
TEST(LocalCommunicatorTest, RegisterMemory) {
int dummy[42];
auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
EXPECT_EQ(memory.data(), &dummy);
EXPECT_EQ(memory.size(), sizeof(dummy));
EXPECT_EQ(memory.transports(), mscclpp::NoTransports);
ASSERT_TRUE(memory.transports() == mscclpp::NoTransports);
}
TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
TEST(LocalCommunicatorTest, SendMemoryToSelf) {
int dummy[42];
auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
comm->sendMemory(memory, 0);
@@ -34,5 +36,5 @@ TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
auto sameMemory = memoryFuture.get();
EXPECT_EQ(sameMemory.data(), memory.data());
EXPECT_EQ(sameMemory.size(), memory.size());
EXPECT_EQ(sameMemory.transports(), memory.transports());
ASSERT_TRUE(sameMemory.transports() == memory.transports());
}

View File

@@ -1,30 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/errors.hpp>
#include "../framework.hpp"
// TODO: ErrorCode needs operator<< for EXPECT_EQ to work
// Using ASSERT_TRUE with manual comparisons as workaround
TEST(ErrorsTest, SystemError) {
mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::SystemError);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: SystemError)"));
}
TEST(ErrorsTest, InternalError) {
mscclpp::Error error("test", mscclpp::ErrorCode::InternalError);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InternalError);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InternalError)"));
}
TEST(ErrorsTest, InvalidUsage) {
mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InvalidUsage);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InvalidUsage)"));
}
TEST(ErrorsTest, Timeout) {
mscclpp::Error error("test", mscclpp::ErrorCode::Timeout);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::Timeout);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: Timeout)"));
}

View File

@@ -0,0 +1,85 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <cassert>
#include <memory>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <unordered_map>
#include "../framework.hpp"
// Simple FIFO performance test to be run as part of unit_tests
// This is a performance test that can be excluded from coverage runs
// using the --exclude-perf-tests flag.
constexpr uint64_t TIMEOUT_SPINS = 1000000;
constexpr int MIN_TRIGGERS = 100; // Reduced for faster unit test execution
__constant__ mscclpp::FifoDeviceHandle gFifoPerfDeviceHandle;
// Device kernel: each thread pushes `numTriggers` triggers into the FIFO.
// Trigger i (1-based) carries fst = i and snd = tid ^ i, so the host consumer
// can recover the producing thread id and check per-thread ordering.
__global__ void kernelFifoPerfPush(size_t numTriggers) {
  mscclpp::FifoDeviceHandle& fifo = gFifoPerfDeviceHandle;
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  mscclpp::ProxyTrigger trigger;
  for (size_t i = 1; i <= numTriggers; ++i) {
    trigger.fst = i;
    trigger.snd = tid ^ i;
    fifo.push(trigger);
  }
}
// Drain numTriggers * parallel triggers from the host side of the FIFO and
// validate them. Returns false on a poll timeout or an out-of-order trigger.
//
// Counters are keyed by the recovered producer thread id. Keys and counts are
// 64-bit: ProxyTrigger fields are 64-bit, and the previous int-keyed map
// silently narrowed them.
static bool consumePerfTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
  int totalTriggers = numTriggers * parallel;
  std::unordered_map<uint64_t, uint64_t> triggerCounts;
  for (int i = 0; i < totalTriggers; ++i) {
    mscclpp::ProxyTrigger trigger;
    uint64_t spin = 0;
    // Busy-poll until a trigger becomes visible, bailing out after
    // TIMEOUT_SPINS iterations so a stalled device cannot hang the test.
    do {
      trigger = hostFifo->poll();
      if (spin++ > TIMEOUT_SPINS) {
        return false;  // timed out waiting for the device to produce
      }
    } while (trigger.fst == 0 || trigger.snd == 0);
    // Clear bit 63 of snd (presumably set by the device-side push, since the
    // kernel itself writes snd = tid ^ i — TODO confirm against Fifo), then
    // undo the tid ^ i encoding to recover the producing thread id.
    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
    trigger.snd = trigger.snd ^ trigger.fst;
    // fst values from a given thread must arrive strictly in order 1, 2, 3, ...
    if (triggerCounts[trigger.snd] + 1 != trigger.fst) {
      return false;  // Validation failed
    }
    triggerCounts[trigger.snd]++;
    hostFifo->pop();
  }
  return true;
}
// Performance smoke test: one device thread pushes MIN_TRIGGERS triggers
// through a 128-slot FIFO while the host drains and validates them.
// Registered via PERF_TEST so it is skipped under --exclude-perf-tests.
PERF_TEST(FifoPerfTest, BasicPerformance) {
  int cudaDevice, numaNode;
  CUDA_CHECK(cudaGetDevice(&cudaDevice));
  // Bind the host thread to the NUMA node associated with the active GPU.
  numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
  mscclpp::numaBind(numaNode);
  const int fifoSize = 128;
  const int numTriggers = MIN_TRIGGERS;
  const int numParallel = 1;  // single producer thread
  auto hostFifo = std::make_unique<mscclpp::Fifo>(fifoSize);
  // Publish the device-side handle through the __constant__ symbol the
  // kernel reads.
  mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
  CUDA_CHECK(cudaMemcpyToSymbol(gFifoPerfDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  // Run kernel asynchronously; the host consumes triggers concurrently.
  kernelFifoPerfPush<<<numParallel, 1, 0, stream>>>(numTriggers);
  CUDA_CHECK(cudaGetLastError());
  // Process triggers
  bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel);
  ASSERT_TRUE(success);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaDeviceSynchronize());
}

View File

@@ -1,13 +1,12 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include "../framework.hpp"
#include "utils_internal.hpp"
#define ITER 10000 // should be larger than the FIFO size for proper testing

View File

@@ -1,10 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/gpu_utils.hpp>
#include "../framework.hpp"
TEST(GpuUtilsTest, StreamPool) {
auto streamPool = mscclpp::gpuStreamPool();
cudaStream_t s;

View File

@@ -1,13 +1,13 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/port_channel.hpp>
#include <mscclpp/port_channel_device.hpp>
#include "../framework.hpp"
#define MAGIC_CONST 777
__constant__ mscclpp::PortChannelDeviceHandle gPortChannel;

View File

@@ -1,11 +1,11 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include "../framework.hpp"
TEST(NumaTest, Basic) {
int num;
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num));

View File

@@ -1,11 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/utils.hpp>
#include <thread>
#include "../framework.hpp"
#include "socket.h"
#include "utils_internal.hpp"

View File

@@ -0,0 +1,6 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "../framework.hpp"
int main(int argc, char** argv) { return RUN_ALL_TESTS(); }

View File

@@ -1,10 +1,9 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <gtest/gtest.h>
#include <thread>
#include "../framework.hpp"
#include "utils_internal.hpp"
TEST(UtilsInternalTest, getHostHash) {

View File

@@ -1,12 +1,12 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <thread>
#include "../framework.hpp"
TEST(UtilsTest, getHostName) {
std::string hostname1 = mscclpp::getHostName(1024, '.');
EXPECT_FALSE(hostname1.empty());