From 5d18835417da4d3d95841179e19f69aeebf796f4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 19 Mar 2026 11:52:09 -0700 Subject: [PATCH 1/2] Fix use-after-free for fabric allocation handle in GpuIpcMemHandle (#764) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fix a use-after-free where the CUDA allocation handle (`CUmemGenericAllocationHandle`) was released prematurely while the exported fabric handle still referenced it. ## Problem Unlike POSIX FD handles (where the kernel keeps the allocation alive via the open file descriptor), fabric handles do not hold their own reference to the underlying allocation. The original code called `cuMemRelease(allocHandle)` immediately after exporting the fabric handle, freeing the allocation. When a remote process later tries to `cuMemImportFromShareableHandle` using that fabric handle, it references a freed allocation — a **use-after-free**. This affected both code paths: 1. **`GpuIpcMemHandle::create()`**: The local `allocHandle` obtained via `cuMemRetainAllocationHandle` was released right after fabric export, leaving the fabric handle dangling. 2. **`GpuIpcMemHandle::createMulticast()`**: The `allocHandle` from `cuMulticastCreate` was unconditionally released, even when it was the only thing keeping the multicast object alive for the fabric handle. ## Fix - **Added `allocHandle` field** to the `fabric` struct in `GpuIpcMemHandle` to store the allocation handle and keep it alive for the lifetime of the `GpuIpcMemHandle`. - **`create()`**: Retain an additional reference via `cuMemRetainAllocationHandle` and store it in `fabric.allocHandle` when a fabric handle is successfully exported. - **`createMulticast()`**: Store the `allocHandle` directly in `fabric.allocHandle` instead of unconditionally releasing it. Only release if fabric export was not used. 
- **`deleter()`**: Release `fabric.allocHandle` via `cuMemRelease` when the handle type includes `Fabric`, ensuring proper cleanup. - **`GpuIpcMem` constructor (importer side)**: Clear `fabric.allocHandle` after importing, since the importer gets its own handle via `cuMemImportFromShareableHandle` and should not release the exporter's allocation handle. ## Files Changed - `src/core/include/gpu_ipc_mem.hpp` — Added `CUmemGenericAllocationHandle allocHandle` to fabric struct. - `src/core/gpu_ipc_mem.cc` — Retain/release allocation handle properly across create, createMulticast, deleter, and importer paths. --- src/core/gpu_ipc_mem.cc | 18 +++++++++++++++--- src/core/include/gpu_ipc_mem.hpp | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc index bc9d375d..c863ecdd 100644 --- a/src/core/gpu_ipc_mem.cc +++ b/src/core/gpu_ipc_mem.cc @@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) { UnixSocketServer::instance().unregisterFd(handle->posixFd.fd); ::close(handle->posixFd.fd); } + if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) { + if (handle->fabric.allocHandle != 0) { + cuMemRelease(handle->fabric.allocHandle); + } + } delete handle; } } @@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter); handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; CUdeviceptr basePtr; size_t sz; @@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { // FABRIC handle if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS) { + MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr)); handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; } @@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle 
GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b handle->offsetFromBase = 0; handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; // POSIX FD handle int fileDesc; @@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) { handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; + handle->fabric.allocHandle = allocHandle; } if (handle->typeFlags == GpuIpcMemHandle::Type::None) { @@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created"); } - // Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the - // multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle. - MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + // Only release allocHandle if it is not stored in fabric.allocHandle. + if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) { + MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + } return handle; #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, @@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) { if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) == CUDA_SUCCESS) { + // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes. 
+ handle_.fabric.allocHandle = {}; type_ = GpuIpcMemHandle::Type::Fabric; } } diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 923e807d..f66545c2 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -44,6 +44,7 @@ struct GpuIpcMemHandle { struct { char handle[64]; + CUmemGenericAllocationHandle allocHandle; } fabric; static void deleter(GpuIpcMemHandle* handle); From 93f6eeaa6b3db46cdf11d659835e81cedc9c94ff Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Mar 2026 23:34:38 -0400 Subject: [PATCH 2/2] Remove GTest dependency, add code coverage, and refactor unit tests and CI pipelines (#744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removes the GTest dependency, replacing it with a minimal custom framework (`test/framework.*`) that covers only what the tests actually use — a unified `TEST()` macro with SFINAE-based fixture auto-detection, `EXPECT_*`/`ASSERT_*` assertions, environments, and setup/teardown. 
- `--exclude-perf-tests` flag and substring-based negative filtering - `MSCCLPP_ENABLE_COVERAGE` CMake option with gcov/lcov; CI uploads to Codecov - Merges standalone `test/perf/` into main test targets - Refactors Azure pipelines to reduce redundancies & make more readable --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Changho Hwang --- .azure-pipelines/codecov.yml | 93 ++++ .azure-pipelines/integration-test.yml | 6 +- .azure-pipelines/multi-nodes-test.yml | 164 ++----- .../{nccl-api-test.yaml => nccl-api-test.yml} | 6 +- .azure-pipelines/rccl-api-test.yml | 3 +- .azure-pipelines/templates/codecov.yml | 110 +++++ .azure-pipelines/templates/deploy.yml | 131 ++++++ .../templates/integration-test.yaml | 242 ----------- .../templates/integration-test.yml | 76 ++++ .azure-pipelines/templates/nccl-test.yaml | 282 ------------ .azure-pipelines/templates/nccl-test.yml | 76 ++++ .azure-pipelines/templates/rccl-test.yaml | 142 ------ .azure-pipelines/templates/rccl-test.yml | 63 +++ .../templates/run-remote-task.yml | 27 ++ .azure-pipelines/templates/stop.yml | 20 + .azure-pipelines/templates/ut-no-ib-env.yaml | 191 --------- .azure-pipelines/templates/ut-no-ib-env.yml | 95 ++++ .azure-pipelines/templates/ut-npkit.yaml | 145 ------- .azure-pipelines/templates/ut-npkit.yml | 57 +++ .azure-pipelines/templates/ut.yaml | 142 ------ .azure-pipelines/templates/ut.yml | 48 +++ .azure-pipelines/ut-rocm.yml | 50 --- .azure-pipelines/ut.yml | 45 +- .codecov.yml | 24 ++ .github/workflows/codeql-analysis.yml | 6 +- .../{doc-build.yaml => doc-build.yml} | 0 .github/workflows/integration-test-backup.yml | 69 --- .github/workflows/mscclpp-lang.yml | 2 +- .github/workflows/ut-backup.yml | 52 --- .gitignore | 1 + CMakeLists.txt | 59 ++- README.md | 13 +- docker/base-dev-x.dockerfile | 30 +- docker/build.sh | 18 +- docs/quickstart.md | 5 +- test/CMakeLists.txt | 27 +- test/deploy/deploy.sh | 2 +- test/deploy/run-remote.sh | 107 
+++++ test/deploy/run_tests.sh | 1 - test/executor_test.cc | 10 +- test/framework.cc | 323 ++++++++++++++ test/framework.hpp | 405 ++++++++++++++++++ test/mp_unit/bootstrap_tests.cc | 18 +- test/mp_unit/communicator_tests.cu | 8 +- test/mp_unit/executor_tests.cc | 7 +- test/mp_unit/ib_tests.cu | 12 +- test/mp_unit/memory_channel_tests.cu | 95 ++-- test/mp_unit/mp_unit_tests.cc | 17 +- test/mp_unit/mp_unit_tests.hpp | 14 +- test/mp_unit/port_channel_tests.cu | 145 ++----- test/mp_unit/switch_channel_tests.cu | 45 +- test/perf/CMakeLists.txt | 44 -- test/perf/fifo_test.cu | 298 ------------- test/perf/framework.cc | 208 --------- test/perf/framework.hpp | 80 ---- test/unit/CMakeLists.txt | 4 +- test/unit/compile_tests.cu | 4 +- test/unit/core_tests.cc | 20 +- test/unit/errors_tests.cc | 17 +- test/unit/fifo_perf_tests.cu | 85 ++++ test/unit/fifo_tests.cu | 5 +- test/unit/gpu_utils_tests.cc | 6 +- test/unit/local_channel_tests.cu | 6 +- test/unit/numa_tests.cc | 6 +- test/unit/socket_tests.cc | 5 +- test/unit/unit_tests_main.cc | 6 + test/unit/utils_internal_tests.cc | 3 +- test/unit/utils_tests.cc | 6 +- 68 files changed, 2116 insertions(+), 2416 deletions(-) create mode 100644 .azure-pipelines/codecov.yml rename .azure-pipelines/{nccl-api-test.yaml => nccl-api-test.yml} (88%) create mode 100644 .azure-pipelines/templates/codecov.yml create mode 100644 .azure-pipelines/templates/deploy.yml delete mode 100644 .azure-pipelines/templates/integration-test.yaml create mode 100644 .azure-pipelines/templates/integration-test.yml delete mode 100644 .azure-pipelines/templates/nccl-test.yaml create mode 100644 .azure-pipelines/templates/nccl-test.yml delete mode 100644 .azure-pipelines/templates/rccl-test.yaml create mode 100644 .azure-pipelines/templates/rccl-test.yml create mode 100644 .azure-pipelines/templates/run-remote-task.yml create mode 100644 .azure-pipelines/templates/stop.yml delete mode 100644 .azure-pipelines/templates/ut-no-ib-env.yaml create mode 100644 
.azure-pipelines/templates/ut-no-ib-env.yml delete mode 100644 .azure-pipelines/templates/ut-npkit.yaml create mode 100644 .azure-pipelines/templates/ut-npkit.yml delete mode 100644 .azure-pipelines/templates/ut.yaml create mode 100644 .azure-pipelines/templates/ut.yml delete mode 100644 .azure-pipelines/ut-rocm.yml create mode 100644 .codecov.yml rename .github/workflows/{doc-build.yaml => doc-build.yml} (100%) delete mode 100644 .github/workflows/integration-test-backup.yml delete mode 100644 .github/workflows/ut-backup.yml create mode 100755 test/deploy/run-remote.sh create mode 100644 test/framework.cc create mode 100644 test/framework.hpp delete mode 100644 test/perf/CMakeLists.txt delete mode 100644 test/perf/fifo_test.cu delete mode 100644 test/perf/framework.cc delete mode 100644 test/perf/framework.hpp create mode 100644 test/unit/fifo_perf_tests.cu create mode 100644 test/unit/unit_tests_main.cc diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml new file mode 100644 index 00000000..c4abeaa7 --- /dev/null +++ b/.azure-pipelines/codecov.yml @@ -0,0 +1,93 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: CodeCoverageA100 + timeoutInMinutes: 40 + pool: + name: msccl-ci + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: CodeCoverageH100 + timeoutInMinutes: 40 + pool: + name: msccl-ci-h100 + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' + +- job: CodeCoverageMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + variables: + - group: mscclpp + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index f6fe3a47..d5d5f9bd 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -41,11 +41,10 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: IntegrationTestH100 @@ -61,10 +60,9 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem perfBaselineFile: test/deploy/perf_ndmv5.jsonl gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 914c2317..d4924879 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -37,33 +37,6 @@ jobs: image: $[ variables['containerImage'] ] steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - - - task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - task: Bash@3 displayName: Add HostEntry inputs: @@ -77,107 +50,46 @@ jobs: echo "Entry already exists, nothing to do." fi - - task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT + - template: templates/deploy.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT - - task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - template: templates/run-remote-task.yml + parameters: + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test - - task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut - - task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests - - task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - - task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID - - - task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT + - template: templates/stop.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yml similarity index 88% rename from .azure-pipelines/nccl-api-test.yaml rename to .azure-pipelines/nccl-api-test.yml index 4951c5bd..cc017412 100644 --- a/.azure-pipelines/nccl-api-test.yaml +++ b/.azure-pipelines/nccl-api-test.yml @@ -40,11 +40,10 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml 
+ - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_80,code=sm_80" - job: NcclTestH100 @@ -61,9 +60,8 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_90,code=sm_90" \ No newline at end of file diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml index 92c5874f..43841079 100644 --- a/.azure-pipelines/rccl-api-test.yml +++ b/.azure-pipelines/rccl-api-test.yml @@ -40,9 +40,8 @@ jobs: image: $(containerImage) steps: - - template: templates/rccl-test.yaml + - template: templates/rccl-test.yml parameters: subscription: mscclpp-ci-mi300x vmssName: mscclpp-mi300x-ci - sshKeySecureFile: mscclpp.pem gpuArch: gfx942 diff --git a/.azure-pipelines/templates/codecov.yml b/.azure-pipelines/templates/codecov.yml new file mode 100644 index 00000000..08797351 --- /dev/null +++ b/.azure-pipelines/templates/codecov.yml @@ -0,0 +1,110 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + buildType: Debug + cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON' + buildDisplayName: 'Build with coverage' + buildName: BuildCoverage + deployArgs: 'single-node-test true ${{ parameters.platform }}' + +- template: run-remote-task.yml + parameters: + name: TestsCoverageNonPerf + displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + STRIP_COUNT=$(echo $BUILD_PREFIX | tr 
-cd / | wc -c) + export GCOV_PREFIX=/root/mscclpp + export GCOV_PREFIX_STRIP=$STRIP_COUNT + + echo "Running unit_tests..." + ./build/bin/unit_tests + echo "unit_tests: PASSED" + + echo "Running mp_unit_tests -np 2..." + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 2: PASSED" + + echo "Running mp_unit_tests -np 4..." + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 4: PASSED" + +- template: run-remote-task.yml + parameters: + name: CaptureCoverage + displayName: Capture coverage data with lcov + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + + GCOV_TOOL_ARG="" + if [ "${{ parameters.platform }}" = "rocm" ]; then + apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1 + GCOV_WRAPPER=$(mktemp) + printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER" + chmod +x "$GCOV_WRAPPER" + GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}" + fi + + lcov --version + LCOV_CAPTURE_ARGS="" + if lcov --help 2>&1 | grep -q "inconsistent"; then + LCOV_CAPTURE_ARGS="--ignore-errors inconsistent" + fi + + lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS} + if [ ! -s coverage.info ]; then + echo "ERROR: coverage.info was not generated." 
+ exit 1 + fi + + lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info + lcov --list coverage.info + ls -la coverage.info + +- task: Bash@3 + name: FetchCoverage + displayName: Fetch coverage data from remote VM + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + HOST=$(head -1 ${HOSTFILE}) + ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \ + 'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info' + scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: UploadCodecov + displayName: Upload coverage to Codecov + inputs: + targetType: 'inline' + script: | + set -e + curl -Os https://cli.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }} + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml new file mode 100644 index 00000000..fc116acf --- /dev/null +++ b/.azure-pipelines/templates/deploy.yml @@ -0,0 +1,131 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp +# Build parameters +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + default: '' +- name: buildType + type: string + default: 'Release' +- name: buildTests + type: string + default: 'true' +- name: cmakeArgs + type: string + default: '' +- name: 
buildName + type: string + default: 'Build' +- name: buildDisplayName + type: string + default: 'Build' +# Deploy parameters +- name: deployArgs + type: string + default: '' + +steps: +# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks. +- task: Bash@3 + name: EnsureAzureCLI + displayName: Ensure Azure CLI Installed + inputs: + targetType: inline + script: | + set -e + if command -v az >/dev/null 2>&1; then + az version >/dev/null + exit 0 + fi + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# 1. Build +- task: Bash@3 + name: ${{ parameters.buildName }} + displayName: ${{ parameters.buildDisplayName }} + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + BUILD_TESTS_ARG="" + if [ "${{ parameters.buildTests }}" = "true" ]; then + BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON" + fi + + GPU_ARCH_ARG="" + if [ -n "${{ parameters.gpuArch }}" ]; then + GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ "${{ parameters.platform }}" = "rocm" ]; then + eval CXX=/opt/rocm/bin/hipcc cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_ROCM=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + else + eval cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + fi + make -j + cd .. + pwd > build/BUILD_PREFIX + echo "=== Build artifacts ===" + ls -la build/bin/ || echo "ERROR: build/bin/ missing after build" + du -sh build/bin/* 2>/dev/null || true + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 2. 
Download SSH key + install packages + start VMSS +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp.pem + +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} + +# 3. Deploy test environment +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: ${{ parameters.deployArgs }} + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml deleted file mode 100644 index 99ed6d04..00000000 --- a/.azure-pipelines/templates/integration-test.yaml +++ /dev/null @@ -1,242 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: inline - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - set -e; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: SendRecvTest - displayName: Run mscclpp SendRecv test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp AllReduce test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN 
./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllToAll - displayName: Run mscclpp AllToAll test - inputs: - targetType: 'inline' - script: | - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: CheckPerfNumber - displayName: Check collective primitives performance - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PythonAllReduceBenchmark - displayName: Python Allreduce Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 -m pip install .; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: FifoPerfBenchmark - displayName: FIFO Performance Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - ./build/bin/perf/fifo_test"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml new file mode 100644 index 00000000..b686e4f2 --- /dev/null +++ b/.azure-pipelines/templates/integration-test.yml @@ -0,0 +1,76 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: perfBaselineFile + type: string + default: 'test/deploy/perf_ndmv4.jsonl' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: AllGatherTest + displayName: Run mscclpp AllGather test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN 
./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: SendRecvTest + displayName: Run mscclpp SendRecv test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllReduceTest + displayName: Run mscclpp AllReduce test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllToAll + displayName: Run mscclpp AllToAll test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K 
-e 1G -f 2 -k 1 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: CheckPerfNumber + displayName: Check collective primitives performance + remoteScript: | + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} + +- template: run-remote-task.yml + parameters: + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + remoteScript: | + python3 -m pip install . + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} \ No newline at end of file diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml deleted file mode 100644 index ef4a9fa8..00000000 --- a/.azure-pipelines/templates/nccl-test.yaml +++ /dev/null @@ -1,282 +0,0 @@ -# .azure-pipelines/templates/nccl-test.yaml -# ---------------------------------------- -# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. -# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# sshKeySecureFile – the secureFile name for your SSH key - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: nvccGencode - type: string - default: "-gencode=arch=compute_80,code=sm_80" - -steps: -- checkout: self -- checkout: git://One/msccl-users -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: mscclpp/test/deploy/deploy.sh - arguments: nccltest-single-node - workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp - -- task: Bash@3 - name: CopyMscclUsers - displayName: Copy msccl-users - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - DST_DIR="/tmp/mscclpp/msccl-users" - parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: GenerateExecutionFile -# displayName: Generate execution file -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd 
/root/mscclpp/msccl-users; \ -# mkdir -p execution-files; \ -# cd /root/mscclpp/msccl-users; \ -# bash algos/mscclpp_a100/generate_execution_plan.sh"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNcclTests - displayName: Install NCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl-tests.git; \ - cd nccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllReduceTest -# displayName: Run NCCL AllReduce Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllGatherTest -# displayName: Run NCCL AllGather Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# 
SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterTest -# displayName: Run NCCL Reduce Scatter Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNccl - displayName: Install NCCL - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | 
cut -d\\\" -f4); \ - if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \ - cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \ - cd nccl; \ - make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllGatherFallbaclkToNcclTest - displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x 
MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllReduceFallbaclkToNcclTest - displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" 
/root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclBroadcastFallbaclkToNcclTest - displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa 
--allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterFallbaclkToNcclTest -# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x 
MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml new file mode 100644 index 00000000..211e2393 --- /dev/null +++ 
b/.azure-pipelines/templates/nccl-test.yml @@ -0,0 +1,76 @@ +# .azure-pipelines/templates/nccl-test.yml +# ---------------------------------------- +# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: nvccGencode + type: string + default: "-gencode=arch=compute_80,code=sm_80" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + deployArgs: 'nccltest-single-node' + +- template: run-remote-task.yml + parameters: + name: InstallNcclTests + displayName: Install NCCL Tests + remoteScript: | + cd + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: InstallNccl + displayName: Install NCCL + remoteScript: | + LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4) + if [ -z "$LATEST_TAG" ]; then + echo "Failed to fetch latest NCCL tag" + exit 1 + fi + cd + git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }} + +- template: run-remote-task.yml + parameters: + name: RunNcclAllGatherFallbaclkToNcclTest + displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa 
--allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclAllReduceFallbaclkToNcclTest + displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclBroadcastFallbaclkToNcclTest + displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x 
LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yaml deleted file mode 100644 index 040605df..00000000 --- a/.azure-pipelines/templates/rccl-test.yaml +++ /dev/null @@ -1,142 +0,0 @@ -# .azure-pipelines/templates/rccl-test.yaml -# ------------------------------------------------ -# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. -# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# vmssName – VMSS name to start/stop -# sshKeySecureFile – the secureFile name for your SSH key -# gpuArch – GPU architecture (e.g. gfx942) - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - default: "gfx942" - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test true rocm" - workingDirectory: $(System.DefaultWorkingDirectory) - - -- task: Bash@3 - name: InstallRcclTests - displayName: Install RCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; \ - git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \ - cd rocm-systems; \ - git sparse-checkout init --cone; \ - git sparse-checkout set projects/rccl-tests; \ - git checkout; \ - cd projects/rccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunRcclAllGatherTest - displayName: Run RCCL AllGather Test with or without MSCCLPP Lib - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - 
ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunRcclAllReduceTest - displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x 
NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml new file mode 100644 index 00000000..8e247161 --- /dev/null +++ b/.azure-pipelines/templates/rccl-test.yml @@ -0,0 +1,63 @@ +# .azure-pipelines/templates/rccl-test.yml +# ------------------------------------------------ +# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop +# vmssName – VMSS name to start/stop +# gpuArch – GPU architecture (e.g. 
gfx942) + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + default: "gfx942" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: rocm + gpuArch: ${{ parameters.gpuArch }} + buildTests: false + deployArgs: 'single-node-test true rocm' + + +- template: run-remote-task.yml + parameters: + name: InstallRcclTests + displayName: Install RCCL Tests + remoteScript: | + cd + git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git + cd rocm-systems + git sparse-checkout init --cone + git sparse-checkout set projects/rccl-tests + git checkout + cd projects/rccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: RunRcclAllGatherTest + displayName: Run RCCL AllGather Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunRcclAllReduceTest + displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + 
subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml new file mode 100644 index 00000000..37b3a7d7 --- /dev/null +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -0,0 +1,27 @@ +parameters: +- name: name + type: string + default: '' +- name: displayName + type: string +- name: runRemoteArgs + type: string + default: '' +- name: remoteScript + type: string +- name: workingDirectory + type: string + default: '$(System.DefaultWorkingDirectory)' + +steps: +- task: Bash@3 + ${{ if ne(parameters.name, '') }}: + name: ${{ parameters.name }} + displayName: ${{ parameters.displayName }} + inputs: + targetType: 'inline' + script: | + test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD' + ${{ parameters.remoteScript }} + REMOTE_CMD + workingDirectory: ${{ parameters.workingDirectory }} diff --git a/.azure-pipelines/templates/stop.yml b/.azure-pipelines/templates/stop.yml new file mode 100644 index 00000000..40498c29 --- /dev/null +++ b/.azure-pipelines/templates/stop.yml @@ -0,0 +1,20 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp + +steps: +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml deleted file mode 100644 index 0d97f9fc..00000000 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ /dev/null @@ -1,191 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- 
name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: StopContainer - displayName: Stop existing container - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true" - rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: BuildWithIb - displayName: Rebuild with IB - inputs: - targetType: 'inline' - script: | - rm -rf build && mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: DeployTestEnvWithIb - displayName: Deploy Test Env (with IB build) - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: PyTestsWithIbBuildDisableIb - displayName: Run pytests (IB build, IB tests disabled) - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml new file mode 100644 index 00000000..a62f1a77 --- /dev/null +++ b/.azure-pipelines/templates/ut-no-ib-env.yml @@ -0,0 +1,95 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ 
parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_USE_IB=OFF' + deployArgs: 'single-node-test false' + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: run-remote-task.yml + parameters: + name: StopContainer + displayName: Stop existing container + runRemoteArgs: '--no-docker --no-log' + remoteScript: | + sudo docker stop mscclpp-test || true + sudo docker rm mscclpp-test || true + +- task: Bash@3 + displayName: Remove generated SSH key files + inputs: + targetType: 'inline' + script: | + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + -DMSCCLPP_BUILD_TESTS=ON \ + -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- template: run-remote-task.yml + parameters: + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml deleted file mode 100644 index 5c35317e..00000000 --- a/.azure-pipelines/templates/ut-npkit.yaml +++ /dev/null @@ -1,145 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - - -steps: -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: 
'$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - mkdir -p build && cd build; \ - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ - make -j"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: 'inline' - script: | - # set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: 
always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml new file mode 100644 index 00000000..e53b5cf5 --- /dev/null +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -0,0 +1,57 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"' + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce" + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + +- template: run-remote-task.yml + 
parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml deleted file mode 100644 index 2086fd0a..00000000 --- a/.azure-pipelines/templates/ut.yaml +++ /dev/null @@ -1,142 +0,0 @@ 
-parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: platform - type: string - default: 'cuda' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - if [ "${{ parameters.platform }}" == "rocm" ]; then - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - else - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - fi - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test true ${{ parameters.platform }}" - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - 
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml new file mode 100644 index 00000000..9d17e923 --- /dev/null +++ b/.azure-pipelines/templates/ut.yml @@ -0,0 +1,48 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun 
--allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/ut-rocm.yml deleted file mode 100644 index 8b0aed1a..00000000 --- a/.azure-pipelines/ut-rocm.yml +++ /dev/null @@ -1,50 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - apps/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - apps/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: UnitTestMI300X - timeoutInMinutes: 40 - pool: - name: msccl-ci-mi300x - strategy: - matrix: - rocm6_2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - container: - image: $(containerImage) - - steps: - - template: templates/ut.yaml - parameters: - subscription: mscclpp-ci-mi300x - vmssName: mscclpp-mi300x-ci - sshKeySecureFile: mscclpp.pem - platform: rocm - gpuArch: gfx942 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 4aac07e6..4e6f96b1 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -37,17 +37,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestWithNpKitA100 @@ -59,17 +58,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestH100 @@ -79,17 +77,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestWithNpKitH100 @@ -99,17 +96,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestNoIBEnv @@ -121,15 +117,34 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-no-ib-env.yaml + - template: templates/ut-no-ib-env.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' + +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yml + parameters: + subscription: mscclpp-ci-mi300x 
+ vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..a98f1e89 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,24 @@ +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + target: 68% + threshold: 1% + patch: + default: + target: 80% + +flag_management: + default_rules: + carryforward: true + +ignore: + - "test/" + - "examples/" + - "python/" + - "tools/" + - "docs/" + - "docker/" diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index db3b488a..fb065141 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - name: Checkout repository @@ -62,7 +62,7 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. + cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF .. make -j4 - name: Perform CodeQL Analysis @@ -107,7 +107,7 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. + CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF .. 
make -j4 - name: Perform CodeQL Analysis diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yml similarity index 100% rename from .github/workflows/doc-build.yaml rename to .github/workflows/doc-build.yml diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml deleted file mode 100644 index 900e8aba..00000000 --- a/.github/workflows/integration-test-backup.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: IntegrationTest - -on: workflow_dispatch - -jobs: - IntegrationTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - - name: Lock GPU clock frequency - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Run mscclpp AllGather test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - - - name: Run mscclpp SendRecv test - run: | - set -e - mpirun 
--allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - - - name: Run mscclpp AllReduce test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - - - name: Run mscclpp AllToAll test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 5947b087..a9187e96 100644 --- 
a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml deleted file mode 100644 index 8849c353..00000000 --- a/.github/workflows/ut-backup.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: UnitTest - -on: workflow_dispatch - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - working-directory: ${{ github.workspace }} - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: UnitTests - run: | - ./build/bin/unit_tests - - - name: MpUnitTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests - - - name: PyTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x diff --git a/.gitignore b/.gitignore index ed3b94c4..74307e67 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .vscode/ build/ +build_coverage/ __pycache__ .*.swp *.so diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ff7b075..9db54d15 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. cmake_minimum_required(VERSION 3.25) project(mscclpp LANGUAGES CXX) @@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF) +option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF) option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON) set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).") @@ -99,6 +100,62 @@ else() message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.") endif() endif() + +# Code coverage setup +if(MSCCLPP_ENABLE_COVERAGE) + if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading") + endif() + + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + message(STATUS "Code coverage enabled") + + # Add coverage flags to C++ targets only (not CUDA) + add_compile_options($<$:--coverage>) + add_compile_options($<$:-O0>) + add_compile_options($<$:-g>) + add_link_options($<$:--coverage>) + + # Find lcov + find_program(LCOV_PATH lcov) + + if(NOT LCOV_PATH) + message(WARNING "lcov not found. Install lcov to generate coverage reports.") + endif() + + if(LCOV_PATH) + # Add coverage target + add_custom_target(coverage + COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..." + COMMAND ${LCOV_PATH} --directory . --zerocounters + + COMMAND ${CMAKE_COMMAND} -E echo "Running tests..." + COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure + + COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..." + COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info + + COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..." 
+ COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info + + COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info" + + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Generating code coverage report" + ) + + # Add coverage clean target + add_custom_target(coverage-clean + COMMAND ${CMAKE_COMMAND} -E remove coverage.info + COMMAND ${LCOV_PATH} --directory . --zerocounters + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Cleaning coverage data" + ) + endif() + else() + message(WARNING "Code coverage is only supported with GCC or Clang compilers") + endif() +endif() if(MSCCLPP_GPU_ARCHS) string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS) string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}") diff --git a/README.md b/README.md index 5366f5b5..58586a30 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,16 @@ [![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest) [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE) [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml) -[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp) | Testing Pipelines | Build Status | |--------------------------|-------------------| -| Unit Tests (CUDA) | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | -| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) | -| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) | -| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) | +| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | +| NCCL Tests | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) | +| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) | A GPU-driven communication stack for scalable AI applications. diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 3aa81422..7c6c927e 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp RUN apt-get update && \ apt-get install -y --no-install-recommends \ htop \ - lcov \ vim \ && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* +# Install lcov 2.2 +RUN LCOV_VERSION="2.2" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + cpanminus \ + gcc \ + make \ + perl \ + && \ + cpanm --notest \ + Capture::Tiny \ + DateTime \ + JSON::XS \ + Memory::Process \ + TimeDate \ + && \ + cd /tmp && \ + curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \ + tar xzf lcov.tar.gz && \ + cd lcov-${LCOV_VERSION} && \ + make install && \ + cd / && rm -rf /tmp/lcov* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + # Install CMake 3.26.4 RUN OS_ARCH=$(uname -m) && \ CMAKE_VERSION="3.26.4" && \ @@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ fi && \ pip install --no-cache-dir --upgrade pip && 
\ - pip install --no-cache-dir -r python/requirements_${target_type}.txt + pip install --no-cache-dir -r python/requirements_${target_type}.txt && \ + pip install --no-cache-dir coverage xlsxwriter # Cleanup RUN rm -rf /tmp/mscclpp diff --git a/docker/build.sh b/docker/build.sh index 63552f74..89568e19 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,27 +4,27 @@ set -e declare -A baseImageTable baseImageTable=( - ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" - ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" - ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" - ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" - ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" + ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable extraLdPathTable=( - ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" - ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" - ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" + ["cuda11.8"]="/usr/local/cuda-11.8/compat" + ["cuda12.4"]="/usr/local/cuda-12.4/compat" + ["cuda12.8"]="/usr/local/cuda-12.8/compat" + ["cuda12.9"]="/usr/local/cuda-12.9/compat" + ["cuda13.0"]="/usr/local/cuda-13.0/compat" ["rocm6.2"]="/opt/rocm/lib" ) declare -A ofedVersionTable ofedVersionTable=( + ["cuda11.8"]="23.07-0.5.1.2" ["cuda12.4"]="23.07-0.5.1.2" ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" @@ -36,7 +36,7 @@ TARGET=${1} OS_ARCH=$(uname -m) print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" } if [[ ! 
-v "baseImageTable[${TARGET}]" ]]; then diff --git a/docs/quickstart.md b/docs/quickstart.md index ac1b7d6b..b7a68050 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se ```bash # For NVIDIA platforms -$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash +$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash # For AMD platforms $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash ``` @@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` @@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list. 
```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6452ebf8..82b799dc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,13 +1,12 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. 
-find_package(MPI) +find_package(MPI REQUIRED) set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads) if(MSCCLPP_USE_IB) list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES}) endif() -set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main) set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include) @@ -17,6 +16,7 @@ if(MSCCLPP_USE_ROCM) foreach(arch ${MSCCLPP_GPU_ARCHS}) add_compile_options(--offload-arch=${arch}) endforeach() + add_compile_definitions(__HIP_PLATFORM_AMD__) endif() function(add_test_executable name sources) @@ -38,28 +38,25 @@ add_test_executable(executor_test executor_test.cc) configure_file(run_mpi_test.sh.in run_mpi_test.sh) include(CTest) -include(FetchContent) -FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip) -option(INSTALL_GTEST OFF) -FetchContent_MakeAvailable(googletest) -include(GoogleTest) + +# Build test framework library +add_library(test_framework STATIC framework.cc) +target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON}) +target_link_libraries(test_framework PUBLIC MPI::MPI_CXX) # Unit tests add_executable(unit_tests) -target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST}) +target_link_libraries(unit_tests ${TEST_LIBS_COMMON} test_framework) target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL}) add_subdirectory(unit) -gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST) +add_test(NAME unit_tests COMMAND unit_tests) # Multi-process unit tests add_executable(mp_unit_tests) -target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX) +target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} test_framework MPI::MPI_CXX) target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL}) add_subdirectory(mp_unit) 
-gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST) +add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh mp_unit_tests 2) # mscclpp-test add_subdirectory(mscclpp-test) - -# Performance tests -add_subdirectory(perf) diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index b26ff1a8..1f1d0e52 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,4 +1,4 @@ -set -e +set -ex TEST_NAME=$1 IB_ENVIRONMENT="${2:-true}" diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh new file mode 100755 index 00000000..b646ea92 --- /dev/null +++ b/test/deploy/run-remote.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Run a command on remote CI VMs via parallel-ssh. +# By default, runs inside the mscclpp-test docker container. +# +# Usage: +# run-remote.sh [OPTIONS] < +# +# Options: +# --no-docker Run command directly on the host, not inside docker +# --no-log Don't tail the log file in the background +# --hostfile Override hostfile path (default: test/deploy/hostfile_ci) +# --host Run command on a single host (uses parallel-ssh -H) +# --user SSH user when using --host or custom hostfile + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HOSTFILE="${SCRIPT_DIR}/hostfile_ci" +SSH_OPTION="StrictHostKeyChecking=no" +KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}" + +USE_DOCKER=true +USE_LOG=true +TARGET_HOST="" +REMOTE_USER="" + +usage() { + echo "Usage: $0 [--no-docker] [--no-log] [--hostfile ] [--host ] [--user ] < " >&2 +} + +require_value() { + local opt="$1" + local val="$2" + if [ -z "$val" ]; then + echo "Missing value for ${opt}" >&2 + exit 1 + fi +} + +while [[ "$1" == --* ]]; do + case "$1" in + --no-docker) USE_DOCKER=false; shift ;; + --no-log) USE_LOG=false; shift ;; + --hostfile) + require_value "--hostfile" "${2-}" + HOSTFILE="$2" + shift 2 + ;; + --host) + require_value "--host" "${2-}" + TARGET_HOST="$2" + shift 2 + ;; + --user) + require_value "--user" "${2-}" + REMOTE_USER="$2" + shift 2 + ;; 
+ *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [ $# -ne 0 ] || [ -t 0 ]; then + usage + exit 1 +fi + +CMD=$(cat) +if [ -z "$CMD" ]; then + usage + exit 1 +fi +CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n') + +PSSH_TARGET_ARGS=() +if [ -n "$TARGET_HOST" ]; then + PSSH_TARGET_ARGS=(-H "$TARGET_HOST") +else + PSSH_TARGET_ARGS=(-h "$HOSTFILE") +fi + +PSSH_USER_ARGS=() +if [ -n "$REMOTE_USER" ]; then + PSSH_USER_ARGS=(-l "$REMOTE_USER") +fi + +PSSH_COMMON=( + -t 0 + "${PSSH_TARGET_ARGS[@]}" + "${PSSH_USER_ARGS[@]}" + -x "-i ${KeyFilePath}" + -O "$SSH_OPTION" +) + +if $USE_DOCKER; then + INNER="set -euxo pipefail;" + INNER+=" cd /root/mscclpp;" + INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;" + INNER+=" CMD_B64='${CMD_B64}';" + INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail" + + parallel-ssh -i "${PSSH_COMMON[@]}" \ + "sudo docker exec mscclpp-test bash -c \"${INNER}\"" +else + parallel-ssh -i "${PSSH_COMMON[@]}" \ + "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail" +fi diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index 488fa81f..0c05a090 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,6 +1,5 @@ set -e HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi -export PATH=/usr/local/mpi/bin:$PATH function run_mscclpp_test() { diff --git a/test/executor_test.cc b/test/executor_test.cc index 0e7869ab..2378e7ff 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -93,11 +93,8 @@ double benchTime(int rank, std::shared_ptr bootstrap, std::s int main(int argc, char* argv[]) { if (argc != 5 && argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << " " - << " " - << " " - << " (optional) " << std::endl; + std::cerr << "Usage: " << argv[0] << " " + << " (optional) " << std::endl; return 1; } @@ -142,7 +139,8 @@ int main(int argc, char* argv[]) { NpKit::Shutdown(); } - std::cout << "Rank " << 
rank << ": " << bufferSize << " bytes " << deltaSec * 1.e6 << " us" << std::endl; + double latencyUs = deltaSec * 1.e6; + std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl; MPI_Finalize(); return 0; } diff --git a/test/framework.cc b/test/framework.cc new file mode 100644 index 00000000..73cf1272 --- /dev/null +++ b/test/framework.cc @@ -0,0 +1,323 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "framework.hpp" + +#include +#include +#include +#include +#include + +namespace mscclpp { +namespace test { + +// Global state +static int gMpiRank = 0; +static int gMpiSize = 1; +static bool gMpiInitialized = false; +static bool gCurrentTestPassed = true; +static std::string gCurrentTestFailureMessage; +static std::string gCurrentTestName; + +std::string currentTestName() { return gCurrentTestName; } + +namespace utils { + +void initializeMPI(int argc, char* argv[]) { + if (gMpiInitialized) return; + + int initialized = 0; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init(&argc, &argv); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank); + MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize); + gMpiInitialized = true; +} + +static void finalizeMPI() { + if (!gMpiInitialized) return; + + MPI_Finalize(); + gMpiInitialized = false; +} + +bool isMainRank() { return gMpiRank == 0; } + +int getMPIRank() { return gMpiRank; } + +int getMPISize() { return gMpiSize; } + +void cleanupMPI() { finalizeMPI(); } + +void reportFailure(const char* file, int line, const std::string& message) { + gCurrentTestPassed = false; + std::ostringstream oss; + oss << file << ":" << line << ": " << message; + if (!gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage += "\n"; + } + gCurrentTestFailureMessage += oss.str(); + std::cerr << oss.str() << std::endl; +} + +void reportSuccess() { + gCurrentTestPassed = true; + gCurrentTestFailureMessage.clear(); +} + +// Timer implementation 
+Timer::Timer() : isRunning_(false) {} + +void Timer::start() { + startTime_ = std::chrono::high_resolution_clock::now(); + isRunning_ = true; +} + +void Timer::stop() { + endTime_ = std::chrono::high_resolution_clock::now(); + isRunning_ = false; +} + +double Timer::elapsedMicroseconds() const { + if (isRunning_) { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now - startTime_).count(); + } + return std::chrono::duration_cast(endTime_ - startTime_).count(); +} + +double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; } + +double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; } + +void cudaCheck(cudaError_t err, const char* file, int line) { + if (err != cudaSuccess) { + std::string msg = + std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err); + throw std::runtime_error(msg); + } +} + +} // namespace utils + +// TestRegistry implementation +TestRegistry& TestRegistry::instance() { + static TestRegistry registry; + return registry; +} + +void TestRegistry::registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory, + bool isPerfTest) { + tests_.push_back({suiteName, testName, std::move(factory), isPerfTest}); +} + +void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(env); } + +// Returns true if the test should run given the filter string. 
+// Filter syntax: +// "" -> run all +// "Pattern" -> run only tests whose full name contains Pattern +// "-Pattern" -> run all tests EXCEPT those whose full name contains Pattern +static bool matchesFilter(const std::string& fullName, const std::string& filter) { + if (filter.empty()) return true; + if (filter[0] == '-') { + // Negative filter: exclude tests matching any comma-separated pattern + std::string patterns = filter.substr(1); + size_t pos = 0; + while (pos < patterns.size()) { + size_t comma = patterns.find(',', pos); + std::string pattern = (comma == std::string::npos) ? patterns.substr(pos) : patterns.substr(pos, comma - pos); + if (!pattern.empty() && fullName.find(pattern) != std::string::npos) { + return false; + } + pos = (comma == std::string::npos) ? patterns.size() : comma + 1; + } + return true; + } + // Positive filter: include only matching tests + return fullName.find(filter) != std::string::npos; +} + +int TestRegistry::runAllTests(int argc, char* argv[]) { + // Initialize MPI if not already initialized + if (!gMpiInitialized) { + utils::initializeMPI(argc, argv); + } + + // Parse command line arguments + std::string filter; + bool excludePerfTests = false; + + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg.find("--filter=") == 0) { + filter = arg.substr(9); // Length of "--filter=" + } else if (arg == "--filter" && i + 1 < argc) { + filter = argv[i + 1]; + ++i; + } else if (arg == "--exclude-perf-tests") { + excludePerfTests = true; + } + } + + // Set up global test environments + for (auto* env : environments_) { + try { + env->SetUp(); + } catch (const std::exception& e) { + if (gMpiRank == 0) { + std::cerr << "Failed to set up test environment: " << e.what() << std::endl; + } + return 1; + } + } + + int passed = 0; + int failed = 0; + int skipped = 0; + + // Count tests to run + int totalToRun = 0; + int skippedByFilter = 0; + for (const auto& entry : tests_) { + std::string fullName = entry.suiteName + "." 
+ entry.testName; + if (excludePerfTests && entry.isPerfTest) { + skippedByFilter++; + continue; + } + if (!matchesFilter(fullName, filter)) { + skippedByFilter++; + continue; + } + totalToRun++; + } + + if (gMpiRank == 0) { + std::cout << "[==========] Running " << totalToRun << " tests"; + if (skippedByFilter > 0) { + std::cout << " (" << skippedByFilter << " skipped by filter)"; + } + std::cout << ".\n"; + } + + for (const auto& entry : tests_) { + std::string fullName = entry.suiteName + "." + entry.testName; + + if (excludePerfTests && entry.isPerfTest) continue; + if (!matchesFilter(fullName, filter)) continue; + + gCurrentTestPassed = true; + gCurrentTestFailureMessage.clear(); + gCurrentTestName = fullName; + + if (gMpiRank == 0) { + std::cout << "[ RUN ] " << fullName << std::endl; + } + + TestCase* testCase = nullptr; + bool testSkipped = false; + bool setUpSucceeded = false; + try { + testCase = entry.factory(); + testCase->SetUp(); + setUpSucceeded = true; + testCase->TestBody(); + } catch (const SkipException& e) { + gCurrentTestPassed = true; + testSkipped = true; + if (gMpiRank == 0) { + std::cout << "[ SKIPPED ] " << fullName << ": " << e.what() << std::endl; + } + } catch (const std::exception& e) { + gCurrentTestPassed = false; + if (gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage = e.what(); + } + } catch (...) { + gCurrentTestPassed = false; + if (gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage = "Unknown exception"; + } + } + + // Always call TearDown() if SetUp() succeeded, even if TestBody() threw + if (setUpSucceeded && testCase != nullptr) { + try { + testCase->TearDown(); + } catch (const std::exception& e) { + // If test already failed, keep original failure message + if (gCurrentTestPassed) { + gCurrentTestPassed = false; + gCurrentTestFailureMessage = std::string("TearDown() failed: ") + e.what(); + } + } catch (...) 
{ + if (gCurrentTestPassed) { + gCurrentTestPassed = false; + gCurrentTestFailureMessage = "TearDown() failed with unknown exception"; + } + } + } + + delete testCase; + gCurrentTestName.clear(); + + if (testSkipped) { + skipped++; + continue; + } + + // Synchronize test status across all MPI processes + int localPassed = gCurrentTestPassed ? 1 : 0; + int globalPassed = 1; + if (gMpiInitialized) { + MPI_Allreduce(&localPassed, &globalPassed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + } else { + globalPassed = localPassed; + } + + if (gMpiRank == 0) { + if (globalPassed) { + std::cout << "[ OK ] " << fullName << std::endl; + passed++; + } else { + std::cout << "[ FAILED ] " << fullName << std::endl; + failed++; + } + } + } + + if (gMpiRank == 0) { + std::cout << "[==========] " << totalToRun << " tests ran.\n"; + if (passed > 0) { + std::cout << "[ PASSED ] " << passed << " tests.\n"; + } + if (skipped > 0) { + std::cout << "[ SKIPPED ] " << skipped << " tests.\n"; + } + if (failed > 0) { + std::cout << "[ FAILED ] " << failed << " tests.\n"; + } + } + + // Tear down global test environments (in reverse order) + for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) { + try { + (*it)->TearDown(); + delete *it; + } catch (const std::exception& e) { + if (gMpiRank == 0) { + std::cerr << "Failed to tear down test environment: " << e.what() << std::endl; + } + } + } + environments_.clear(); + + return failed > 0 ? 1 : 0; +} + +} // namespace test +} // namespace mscclpp diff --git a/test/framework.hpp b/test/framework.hpp new file mode 100644 index 00000000..26a32d5b --- /dev/null +++ b/test/framework.hpp @@ -0,0 +1,405 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#ifndef MSCCLPP_TEST_FRAMEWORK_HPP_ +#define MSCCLPP_TEST_FRAMEWORK_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mscclpp { +namespace test { + +// Test case base class +class TestCase { + public: + virtual ~TestCase() = default; + virtual void SetUp() {} + virtual void TearDown() {} + virtual void TestBody() = 0; +}; + +// Environment base class (for global test setup/teardown) +class Environment { + public: + virtual ~Environment() = default; + virtual void SetUp() {} + virtual void TearDown() {} +}; + +// Test registry and runner +class TestRegistry { + public: + using TestFactory = std::function; + + static TestRegistry& instance(); + + void registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory, + bool isPerfTest = false); + void addEnvironment(Environment* env); + int runAllTests(int argc, char* argv[]); + + private: + TestRegistry() = default; + struct TestEntry { + std::string suiteName; + std::string testName; + TestFactory factory; + bool isPerfTest; + }; + std::vector tests_; + std::vector environments_; +}; + +// Returns "Suite.Name" for the currently running test, or "" if none. 
+std::string currentTestName(); + +// Utility functions +namespace utils { + +// MPI management +void initializeMPI(int argc, char* argv[]); +void cleanupMPI(); +bool isMainRank(); +int getMPIRank(); +int getMPISize(); + +// Timing utilities +class Timer { + public: + Timer(); + void start(); + void stop(); + double elapsedMicroseconds() const; + double elapsedMilliseconds() const; + double elapsedSeconds() const; + + private: + std::chrono::high_resolution_clock::time_point startTime_; + std::chrono::high_resolution_clock::time_point endTime_; + bool isRunning_; +}; + +// CUDA utilities +void cudaCheck(cudaError_t err, const char* file, int line); +#define CUDA_CHECK(call) mscclpp::test::utils::cudaCheck(call, __FILE__, __LINE__) + +// Test assertion helpers +void reportFailure(const char* file, int line, const std::string& message); +void reportSuccess(); + +} // namespace utils + +// Exception for test skips +class SkipException : public std::runtime_error { + public: + explicit SkipException(const std::string& message) : std::runtime_error(message) {} +}; + +// Helper class for FAIL() macro — supports message streaming via operator<< +class FailHelper { + public: + explicit FailHelper(const char* file, int line) : file_(file), line_(line) {} + template + FailHelper& operator<<(const T& value) { + message_ << value; + return *this; + } + ~FailHelper() noexcept(false) { + std::string msg = message_.str(); + if (!msg.empty()) { + ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg); + } else { + ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed"); + } + throw std::runtime_error("Test failed"); + } + + private: + const char* file_; + int line_; + std::ostringstream message_; +}; + +// Helper class for SKIP_TEST() macro — supports message streaming via operator<< +// Usage: SKIP_TEST() << "Reason for skipping"; +class SkipHelper { + public: + explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {} + template 
+ SkipHelper& operator<<(const T& value) { + message_ << value; + return *this; + } + ~SkipHelper() noexcept(false) { + std::string msg = message_.str(); + if (!msg.empty()) { + throw SkipException("Test skipped: " + msg); + } else { + throw SkipException("Test skipped"); + } + } + + private: + const char* file_; + int line_; + std::ostringstream message_; +}; + +// SFINAE helper: resolves to T if T is a complete type (user-defined fixture), +// otherwise falls back to TestCase. This lets TEST() work with or without a fixture class. +namespace detail { +template +using void_t = void; + +template > +struct FixtureOf { + using type = TestCase; +}; +template +struct FixtureOf> { + using type = T; +}; +} // namespace detail + +} // namespace test +} // namespace mscclpp + +// --- Test registration macros --- +// TEST(Suite, Name): if Suite is a previously-defined class, the test inherits from it (fixture). +// Otherwise, the test inherits from TestCase (no fixture needed). + +#define TEST(test_fixture, test_name) \ + class test_fixture; \ + class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf::type { \ + public: \ + void TestBody() override; \ + }; \ + static bool test_fixture##_##test_name##_registered = []() { \ + ::mscclpp::test::TestRegistry::instance().registerTest( \ + #test_fixture, #test_name, \ + []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \ + return true; \ + }(); \ + void test_fixture##_##test_name##_Test::TestBody() + +#define PERF_TEST(test_fixture, test_name) \ + class test_fixture; \ + class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf::type { \ + public: \ + void TestBody() override; \ + }; \ + static bool test_fixture##_##test_name##_registered = []() { \ + ::mscclpp::test::TestRegistry::instance().registerTest( \ + #test_fixture, #test_name, \ + []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, true); \ + 
return true; \ + }(); \ + void test_fixture##_##test_name##_Test::TestBody() + +// --- Test runner macro --- +#define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv) + +// Assertion macros +#define EXPECT_TRUE(condition) \ + do { \ + if (!(condition)) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \ + } \ + } while (0) + +#define EXPECT_FALSE(condition) \ + do { \ + if (condition) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \ + } \ + } while (0) + +#define EXPECT_EQ(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " == " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_NE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 != v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " != " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_LT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 < v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " < " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_LE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 <= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " <= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_GT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 > v2)) { \ + std::ostringstream oss; 
\ + oss << "Expected: " #val1 " > " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_GE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 >= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " >= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define ASSERT_TRUE(condition) \ + do { \ + if (!(condition)) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_FALSE(condition) \ + do { \ + if (condition) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_EQ(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " == " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_NE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 != v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " != " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_LT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 < v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " < " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, 
__LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_LE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 <= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " <= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_GT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 > v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " > " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_GE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 >= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " >= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_NO_THROW(statement) \ + do { \ + try { \ + statement; \ + } catch (const std::exception& e) { \ + std::ostringstream oss; \ + oss << "Expected: " #statement " not to throw\n Actual: threw " << e.what(); \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } catch (...) { \ + ::mscclpp::test::utils::reportFailure( \ + __FILE__, __LINE__, "Expected: " #statement " not to throw\n Actual: threw unknown exception"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +// --- Test control macros --- + +// Fail the current test immediately. Usage: FAIL() << "reason"; +#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__) + +// Skip the current test. 
Usage: SKIP_TEST() << "reason"; +#define SKIP_TEST() ::mscclpp::test::SkipHelper(__FILE__, __LINE__) + +#endif // MSCCLPP_TEST_FRAMEWORK_HPP_ diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index 4bbab2f1..c28087a4 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -48,7 +48,7 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr bootstr bootstrapTestSendRecv(bootstrap); } -TEST_F(BootstrapTest, WithId) { +TEST(BootstrapTest, WithId) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); @@ -57,13 +57,13 @@ TEST_F(BootstrapTest, WithId) { bootstrapTestAll(bootstrap); } -TEST_F(BootstrapTest, WithIpPortPair) { +TEST(BootstrapTest, WithIpPortPair) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->initialize(gEnv->args["ip_port"]); bootstrapTestAll(bootstrap); } -TEST_F(BootstrapTest, ResumeWithId) { +TEST(BootstrapTest, ResumeWithId) { // This test may take a few minutes. 
bootstrapTestTimer.set(300); @@ -76,19 +76,19 @@ TEST_F(BootstrapTest, ResumeWithId) { } } -TEST_F(BootstrapTest, ResumeWithIpPortPair) { +TEST(BootstrapTest, ResumeWithIpPortPair) { for (int i = 0; i < 5; ++i) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->initialize(gEnv->args["ip_port"]); } } -TEST_F(BootstrapTest, ExitBeforeConnect) { +TEST(BootstrapTest, ExitBeforeConnect) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->createUniqueId(); } -TEST_F(BootstrapTest, TimeoutWithId) { +TEST(BootstrapTest, TimeoutWithId) { mscclpp::Timer timer; // All ranks initialize a bootstrap with their own id (will hang) @@ -99,7 +99,7 @@ TEST_F(BootstrapTest, TimeoutWithId) { // Set bootstrap timeout to 1 second bootstrap->initialize(id, 1); } catch (const mscclpp::Error& e) { - ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout); + ASSERT_TRUE(e.getErrorCode() == mscclpp::ErrorCode::Timeout); } // Timeout should be sligtly greater than 1 second @@ -139,7 +139,7 @@ class MPIBootstrap : public mscclpp::Bootstrap { } }; -TEST_F(BootstrapTest, MPIBootstrap) { +TEST(BootstrapTest, MPIBootstrap) { auto bootstrap = std::make_shared(); bootstrapTestAll(bootstrap); } diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu index 9d83532a..066c5514 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include @@ -185,7 +185,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) { return true; } -TEST_F(CommunicatorTest, BasicWrite) { +TEST(CommunicatorTest, BasicWrite) { if (gEnv->rank >= numRanksToUse) return; deviceBufferInit(); @@ -215,7 +215,7 @@ __global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle } } -TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) { +TEST(CommunicatorTest, WriteWithDeviceSemaphores) { if (gEnv->rank >= numRanksToUse) return; std::unordered_map> semaphores; @@ -254,7 +254,7 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) { communicator->bootstrap()->barrier(); } -TEST_F(CommunicatorTest, WriteWithHostSemaphores) { +TEST(CommunicatorTest, WriteWithHostSemaphores) { if (gEnv->rank >= numRanksToUse) return; std::unordered_map> semaphores; diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc index a903ed08..4f3f2545 100644 --- a/test/mp_unit/executor_tests.cc +++ b/test/mp_unit/executor_tests.cc @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include +#include #include #include @@ -22,7 +23,7 @@ std::string getExecutablePath() { void ExecutorTest::SetUp() { if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) { - GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2"; + SKIP_TEST() << "This test requires world size to be 2 and ranks per node to be 2"; } MultiProcessTest::SetUp(); @@ -49,7 +50,7 @@ void ExecutorTest::TearDown() { MultiProcessTest::TearDown(); } -TEST_F(ExecutorTest, TwoNodesAllreduce) { +TEST(ExecutorTest, TwoNodesAllreduce) { std::string executablePath = getExecutablePath(); std::filesystem::path path = executablePath; std::filesystem::path executionFilesPath = diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 051030ac..04ab402d 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -18,9 +18,7 @@ void IbTestBase::SetUp() { } void IbPeerToPeerTest::SetUp() { -#if !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) + REQUIRE_IBVERBS; IbTestBase::SetUp(); @@ -80,7 +78,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6 qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); } -TEST_F(IbPeerToPeerTest, SimpleSendRecv) { +TEST(IbPeerToPeerTest, SimpleSendRecv) { if (gEnv->rank >= 2) { // This test needs only two ranks return; @@ -195,7 +193,7 @@ __global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIt } } -TEST_F(IbPeerToPeerTest, MemoryConsistency) { +TEST(IbPeerToPeerTest, MemoryConsistency) { if (gEnv->rank >= 2) { // This test needs only two ranks return; @@ -303,7 +301,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { EXPECT_EQ(res, 0); } -TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) { 
+TEST(IbPeerToPeerTest, SimpleAtomicAdd) { if (gEnv->rank >= 2) { // This test needs only two ranks return; diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu index f6ef3aed..318d301a 100644 --- a/test/mp_unit/memory_channel_tests.cu +++ b/test/mp_unit/memory_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -8,7 +8,7 @@ void MemoryChannelOneToOneTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { - GTEST_SKIP(); + SKIP_TEST(); } // Use only two ranks setNumRanksToUse(2); @@ -88,27 +88,12 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName, std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); // The least nelem is 2 for packet ping pong - kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; + for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelWrapper(buff.get(), gEnv->rank, nElem, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } int nTries = 1000000; communicator->bootstrap()->barrier(); @@ -169,7 +154,7 @@ __global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(MemoryChannelOneToOneTest, PutPingPong) { +TEST(MemoryChannelOneToOneTest, PutPingPong) { if (gEnv->rank >= numRanksToUse) 
return; const int nElem = 4 * 1024 * 1024; @@ -187,28 +172,12 @@ TEST_F(MemoryChannelOneToOneTest, PutPingPong) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } } __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { @@ -248,7 +217,7 @@ __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(MemoryChannelOneToOneTest, GetPingPong) { +TEST(MemoryChannelOneToOneTest, GetPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -266,28 +235,12 @@ TEST_F(MemoryChannelOneToOneTest, GetPingPong) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - 
- EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } } __global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { @@ -371,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int* } } -TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) { +TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) { auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper); } -TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) { +TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) { auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc index cafd9bbc..2f6dc1ca 100644 --- a/test/mp_unit/mp_unit_tests.cc +++ b/test/mp_unit/mp_unit_tests.cc @@ -98,14 +98,18 @@ static std::unordered_map parseArgs(int argc, const ch continue; } - // Unrecognized positional token: ignore to keep parser permissive for gtest/MPI extras + // Unrecognized positional token: ignore } return options; } void MultiProcessTestEnv::SetUp() { - MPI_Init(NULL, NULL); + int initialized = 0; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init(NULL, NULL); + } MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); // get the local number of nodes with MPI @@ 
-128,18 +132,17 @@ void MultiProcessTest::TearDown() { } int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); gEnv = new MultiProcessTestEnv(argc, (const char**)argv); - ::testing::AddGlobalTestEnvironment(gEnv); + ::mscclpp::test::TestRegistry::instance().addEnvironment(gEnv); return RUN_ALL_TESTS(); } -TEST_F(MultiProcessTest, Prelim) { +TEST(MultiProcessTest, Prelim) { // Test to make sure the MPI environment is set up correctly ASSERT_GE(gEnv->worldSize, 2); } -TEST_F(MultiProcessTest, HostName) { +TEST(MultiProcessTest, HostName) { const size_t maxNameLen = 1024; std::vector buffer(gEnv->worldSize * maxNameLen, '\0'); std::string hostName = mscclpp::getHostName(maxNameLen, '\0'); @@ -159,7 +162,7 @@ TEST_F(MultiProcessTest, HostName) { } } -TEST_F(MultiProcessTest, HostHash) { +TEST(MultiProcessTest, HostHash) { std::vector buffer(gEnv->worldSize, 0); uint64_t hostHash = mscclpp::getHostHash(); buffer[gEnv->rank] = hostHash; diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index 17046a57..03e4cbde 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -4,8 +4,6 @@ #ifndef MSCCLPP_MP_UNIT_TESTS_HPP_ #define MSCCLPP_MP_UNIT_TESTS_HPP_ -#include - #include #include #include @@ -13,10 +11,18 @@ #include #include +#include "../framework.hpp" #include "ib.hpp" #include "utils_internal.hpp" -class MultiProcessTestEnv : public ::testing::Environment { +// Skip the current test if IBVerbs is not available in this build +#if defined(USE_IBVERBS) +#define REQUIRE_IBVERBS +#else +#define REQUIRE_IBVERBS SKIP_TEST() << "This test requires IBVerbs that the current build does not support." 
+#endif + +class MultiProcessTestEnv : public ::mscclpp::test::Environment { public: MultiProcessTestEnv(int argc, const char** argv); @@ -37,7 +43,7 @@ mscclpp::Transport ibIdToTransport(int id); int rankToLocalRank(int rank); int rankToNode(int rank); -class MultiProcessTest : public ::testing::Test { +class MultiProcessTest : public ::mscclpp::test::TestCase { protected: void TearDown() override; }; diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index 7cc5954a..764c3299 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include #include @@ -178,26 +178,12 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, params.waitWithPoll, nTries, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } proxyService->stopProxy(); } @@ -223,8 +209,7 @@ void 
PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); - const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name()); + const std::string testName = ::mscclpp::test::currentTestName(); const int nTries = 1000; // Warm-up @@ -247,63 +232,51 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PingPong) { +TEST(PortChannelOneToOneTest, PingPong) { testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostMode) { + REQUIRE_IBVERBS; testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongEthernet) { +TEST(PortChannelOneToOneTest, PingPongEthernet) { testPingPong(PingPongTestParams{ .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { +TEST(PortChannelOneToOneTest, PingPongWithPoll) { testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { + REQUIRE_IBVERBS; testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode 
= IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerf) { +TEST(PortChannelOneToOneTest, PingPongPerf) { testPingPongPerf(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) { + REQUIRE_IBVERBS; testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { +TEST(PortChannelOneToOneTest, PingPongPerfEthernet) { testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } @@ -407,34 +380,14 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; - // The least nelem is 2 for packet ping pong - kernelProxyLLPingPong - <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 2, nTries, ret.get()); - 
MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong - <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, - 1024 * 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, - 4 * 1024 * 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelProxyLLPingPong + <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, nElem, nTries, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } communicator->bootstrap()->barrier(); @@ -471,8 +424,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) proxyService->startProxy(); - auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); - const std::string testName = std::string(testInfo->test_suite_name()) + "." 
+ std::string(testInfo->name()); + const std::string testName = ::mscclpp::test::currentTestName(); const int nTries = 1000000; // Warm-up @@ -497,47 +449,32 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } +TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) { + REQUIRE_IBVERBS; testPacketPingPong(true, IbMode::Host); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } +TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { + REQUIRE_IBVERBS; testPacketPingPongPerf(true, IbMode::Host); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPacketPingPongPerf(true, IbMode::HostNoAtomic); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { + REQUIRE_IBVERBS; 
testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPacketPingPong(true, IbMode::HostNoAtomic); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu index a12919e3..6d913c64 100644 --- a/test/mp_unit/switch_channel_tests.cu +++ b/test/mp_unit/switch_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include #include @@ -10,10 +10,10 @@ void SwitchChannelTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { - GTEST_SKIP(); + SKIP_TEST(); } if (!mscclpp::isNvlsSupported()) { - GTEST_SKIP(); + SKIP_TEST(); } // Use only two ranks setNumRanksToUse(2); @@ -23,6 +23,8 @@ void SwitchChannelTest::SetUp() { void SwitchChannelTest::TearDown() { CommunicatorTestBase::TearDown(); } __constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan; +__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1; +__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2; __global__ void kernelSwitchReduce() { #if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) @@ -31,7 +33,16 @@ __global__ void kernelSwitchReduce() { #endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) } -TEST_F(SwitchChannelTest, SimpleAllReduce) { +__global__ void kernelSwitchReduceTwo() { +#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) + auto val1 = gConstSwitchChan1.reduce(0); + gConstSwitchChan1.broadcast(0, val1); + auto val2 = gConstSwitchChan2.reduce(0); + gConstSwitchChan2.broadcast(0, val2); +#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) +} + +TEST(SwitchChannelTest, SimpleAllReduce) { if (gEnv->rank >= numRanksToUse) return; std::vector ranks; @@ -66,22 +77,13 @@ TEST_F(SwitchChannelTest, SimpleAllReduce) { for (int i = 0; i < numRanksToUse; i++) { expected += i + 1.0f; } - ASSERT_EQ(result, expected) << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank; + if (result != expected) { + std::cerr << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank << std::endl; + } + ASSERT_EQ(result, expected); } -__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1; -__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2; - -__global__ void kernelSwitchReduceTwo() { -#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) - auto val1 = 
gConstSwitchChan1.reduce(0); - gConstSwitchChan1.broadcast(0, val1); - auto val2 = gConstSwitchChan2.reduce(0); - gConstSwitchChan2.broadcast(0, val2); -#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) -} - -TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { +TEST(SwitchChannelTest, TwoChannelsSameConnection) { if (gEnv->rank >= numRanksToUse) return; std::vector ranks; @@ -97,12 +99,9 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice)); MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice)); - // Connection size must be large enough for two granularity-aligned buffers. - // The multicast granularity is typically 2MB, so we need at least 2 * 2MB. const size_t connSize = buffer1.bytes() + buffer2.bytes(); auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize); - // Bind two separate buffers to the same connection auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize); auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize); @@ -132,6 +131,6 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { expected1 += (i + 1.0f) * 1.0f; expected2 += (i + 1.0f) * 10.0f; } - ASSERT_EQ(result1, expected1) << "Channel1: expected " << expected1 << " but got " << result1; - ASSERT_EQ(result2, expected2) << "Channel2: expected " << expected2 << " but got " << result2; + ASSERT_EQ(result1, expected1); + ASSERT_EQ(result2, expected2); } diff --git a/test/perf/CMakeLists.txt b/test/perf/CMakeLists.txt deleted file mode 100644 index 6a16c034..00000000 --- a/test/perf/CMakeLists.txt +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -# Find required packages -find_package(MPI REQUIRED) - -# Note: nlohmann_json::nlohmann_json target is already available from the main project - -# Set up common libraries and includes for tests -set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX) -if(MSCCLPP_USE_IB) - list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES}) -endif() - -set(PERF_TEST_INC_COMMON - PRIVATE ${PROJECT_SOURCE_DIR}/include - SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) - -# Function to add a test executable -function(add_perf_test_executable name sources) - if(MSCCLPP_USE_ROCM) - set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX) - endif() - add_executable(${name} ${sources}) - target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON}) - - # Link nlohmann_json - use the target from main project - target_link_libraries(${name} nlohmann_json::nlohmann_json) - - if(MSCCLPP_USE_IB) - target_compile_definitions(${name} PRIVATE USE_IBVERBS) - endif() - - target_include_directories(${name} ${PERF_TEST_INC_COMMON}) - target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS) - - # Set C++ standard - target_compile_features(${name} PRIVATE cxx_std_17) - - set_target_properties(${name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/perf") -endfunction() - -# Add FIFO test -add_perf_test_executable(fifo_test "framework.cc;fifo_test.cu") diff --git a/test/perf/fifo_test.cu b/test/perf/fifo_test.cu deleted file mode 100644 index bb77a106..00000000 --- a/test/perf/fifo_test.cu +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "framework.hpp" - -using namespace mscclpp::test; - -// Constants for timeout and trigger calculation -constexpr uint64_t TIMEOUT_SPINS = 1000000; -constexpr int MIN_TRIGGERS = 1000; -constexpr int MIN_WARMUP_TRIGGERS = 100; -constexpr int TRIGGERS_PER_FIFO_SIZE = 10; -constexpr int WARMUP_TRIGGERS_PER_FIFO_SIZE = 2; - -__constant__ mscclpp::FifoDeviceHandle gFifoDeviceHandle; - -__global__ void kernelFifoPush(size_t numTriggers) { - mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - mscclpp::ProxyTrigger trigger; - for (size_t i = 1; i <= numTriggers; ++i) { - trigger.fst = i; - trigger.snd = tid ^ i; - fifo.push(trigger); - } -} - -__global__ void kernelFifoPushSync(size_t numTriggers) { - mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle; - mscclpp::ProxyTrigger trigger; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i = 1; i <= numTriggers; ++i) { - trigger.fst = i; - trigger.snd = tid ^ i; - fifo.sync(fifo.push(trigger)); - } -} - -static void setupCuda(int& cudaDevice, int& numaNode) { - utils::CUDA_CHECK(cudaGetDevice(&cudaDevice)); - numaNode = mscclpp::getDeviceNumaNode(cudaDevice); - mscclpp::numaBind(numaNode); -} - -// Helper function to consume triggers from FIFO -static bool consumeTriggers(std::unique_ptr& hostFifo, int numTriggers, int parallel) { - int totalTriggers = numTriggers * parallel; - std::unordered_map triggerCounts; - for (int i = 0; i < totalTriggers; ++i) { - mscclpp::ProxyTrigger trigger; - uint64_t spin = 0; - do { - trigger = hostFifo->poll(); - if (spin++ > TIMEOUT_SPINS) { - return false; - } - } while (trigger.fst == 0 || trigger.snd == 0); - - // Process trigger (see src/proxy.cc) - trigger.snd ^= ((uint64_t)1 << (uint64_t)63); - trigger.snd = trigger.snd ^ trigger.fst; - assert(triggerCounts[trigger.snd] + 1 == trigger.fst); - 
triggerCounts[trigger.snd]++; - hostFifo->pop(); - } - return true; -} - -// Helper function to run a single kernel variant and return performance metrics -std::tuple runSingleKernelVariant(void (*kernel)(size_t), - std::unique_ptr& hostFifo, - cudaStream_t stream, int numParallel) { - // Calculate triggers based on FIFO size - const int numTriggers = std::max(MIN_TRIGGERS, static_cast(hostFifo->size() * TRIGGERS_PER_FIFO_SIZE)); - const int warmupTriggers = - std::max(MIN_WARMUP_TRIGGERS, static_cast(hostFifo->size() * WARMUP_TRIGGERS_PER_FIFO_SIZE)); - - // Warmup - kernel<<>>(warmupTriggers); - utils::CUDA_CHECK(cudaGetLastError()); - - // Process warmup triggers (note: total triggers = warmupTriggers * numParallel) - if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) { - return {0.0, 0.0, 0, 0}; // Return error values - } - utils::CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Benchmark - utils::Timer timer; - timer.start(); - - kernel<<>>(numTriggers); - utils::CUDA_CHECK(cudaGetLastError()); - - // Process all triggers - if (!consumeTriggers(hostFifo, numTriggers, numParallel)) { - return {0.0, 0.0, 0, 0}; - } - utils::CUDA_CHECK(cudaStreamSynchronize(stream)); - - timer.stop(); - - const int totalTriggers = numTriggers * numParallel; - double throughput = totalTriggers / timer.elapsedSeconds(); - double duration_us = timer.elapsedMicroseconds(); - - utils::CUDA_CHECK(cudaDeviceSynchronize()); - - return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel}; -} - -void runFifoTestVariant(std::unique_ptr& hostFifo, cudaStream_t stream, int numParallel, - nlohmann::ordered_json& combinedMetrics) { - auto [pushThroughput, pushDuration, numTriggers, warmupTriggers] = - runSingleKernelVariant(kernelFifoPush, hostFifo, stream, numParallel); - - auto [syncThroughput, syncDuration, syncNumTriggers, syncWarmupTriggers] = - runSingleKernelVariant(kernelFifoPushSync, hostFifo, stream, numParallel); - - auto formatThroughput = [](double 
thru) { - return double(int(thru * 10)) / 10.0; // Round to 1 decimal place - }; - - std::string prefix = "p" + std::to_string(numParallel) + "_"; - combinedMetrics[prefix + "push_throughput"] = formatThroughput(pushThroughput); - combinedMetrics[prefix + "push_sync_throughput"] = formatThroughput(syncThroughput); - combinedMetrics[prefix + "push_duration_us"] = pushDuration; - combinedMetrics[prefix + "push_sync_duration_us"] = syncDuration; - combinedMetrics[prefix + "num_triggers"] = numTriggers; - combinedMetrics[prefix + "warmup_triggers"] = warmupTriggers; -} - -struct FifoTestConfig { - int fifoSize; - std::vector parallelismLevels; - - // Constructor with default parallelism levels - FifoTestConfig(int size, const std::vector& parallel = {1, 2, 4, 8, 16}) - : fifoSize(size), parallelismLevels(parallel) {} -}; - -void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, - [[maybe_unused]] int localRank) { - if (config.fifoSize <= 0) { - throw std::invalid_argument("FIFO size must be positive"); - } - if (config.parallelismLevels.empty()) { - throw std::invalid_argument("At least one parallelism level must be specified"); - } - - int cudaDevice, numaNode; - setupCuda(cudaDevice, numaNode); - - auto hostFifo = std::make_unique(config.fifoSize); - - mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle(); - utils::CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle))); - - cudaStream_t stream; - utils::CUDA_CHECK(cudaStreamCreate(&stream)); - - // Create test name with parallelism range - std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel"; - - // Add parallelism range to test name (e.g., "P1-16" or "P1-4-16-64") - if (!config.parallelismLevels.empty()) { - testName += std::to_string(config.parallelismLevels.front()); - if (config.parallelismLevels.size() > 1) { - testName += "-" + std::to_string(config.parallelismLevels.back()); 
- - // If parallelism levels have non-standard steps, include more detail - if (config.parallelismLevels.size() > 2 && - (config.parallelismLevels[1] != 2 * config.parallelismLevels[0] || config.parallelismLevels.size() > 3)) { - testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_ParallelCustom"; - } - } - } - - // Print test configuration - if (utils::isMainRank()) { - std::stringstream ss; - ss << "Running FIFO test with size=" << config.fifoSize << ", parallelism_levels=["; - for (size_t i = 0; i < config.parallelismLevels.size(); ++i) { - if (i > 0) ss << ","; - ss << config.parallelismLevels[i]; - } - ss << "]"; - std::cout << ss.str() << std::endl; - } - - nlohmann::ordered_json combinedMetrics; - - for (int numParallel : config.parallelismLevels) { - runFifoTestVariant(hostFifo, stream, numParallel, combinedMetrics); - } - - std::map testParams; - testParams["fifo_size"] = std::to_string(static_cast(hostFifo->size())); - - // Add parallelism levels to test parameters - std::stringstream parallelismStream; - for (size_t i = 0; i < config.parallelismLevels.size(); ++i) { - if (i > 0) parallelismStream << ","; - parallelismStream << config.parallelismLevels[i]; - } - testParams["parallelism_levels"] = parallelismStream.str(); - - utils::recordResult(testName, "fifo", combinedMetrics, testParams); - - utils::CUDA_CHECK(cudaStreamDestroy(stream)); -} - -void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) { - // clang-format off - std::vector configs = { - {1, {1}}, - {128, {1, 8, 64, 128}}, - {512, {1, 8, 64, 256, 512}}, - }; - // clang-format on - - for (const auto& config : configs) { - runFifoTest(config, rank, worldSize, localRank); - } -} - -static void printUsage(char* argv0) { - std::stringstream ss; - ss << "Usage: " << argv0 << " [OPTIONS]\n" - << "\n" - << "Options:\n" - << " -o, --output-format FORMAT Output format: human or json (default: human)\n" - << " -f, --output-file FILE 
JSON output file path (default: report.jsonl)\n" - << " -v, --verbose Increase verbosity\n" - << " -h, --help Show this help message\n"; - std::cout << ss.str(); -} - -int main(int argc, char* argv[]) { - std::string outputFormat = "human"; - std::string outputFile = "report.jsonl"; - bool verbose = false; - - static struct option longOptions[] = {{"output-format", required_argument, 0, 'o'}, - {"output-file", required_argument, 0, 'f'}, - {"verbose", no_argument, 0, 'v'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0}}; - - int c; - while ((c = getopt_long(argc, argv, "o:f:vh", longOptions, nullptr)) != -1) { - switch (c) { - case 'o': - outputFormat = optarg; - break; - case 'f': - outputFile = optarg; - break; - case 'v': - verbose = true; - break; - case 'h': - printUsage(argv[0]); - return 0; - default: - printUsage(argv[0]); - return 1; - } - } - - std::vector>> tests = { - {"AllFifoTests", "FIFO performance tests with multiple configurations", runAllFifoTests}}; - - int result = utils::runMultipleTests(argc, argv, tests); - - if (utils::isMainRank()) { - if (outputFormat == "json") { - utils::writeResultsToFile(outputFile); - } else { - utils::printResults(verbose); - } - } - - utils::cleanupMPI(); - - return result; -} diff --git a/test/perf/framework.cc b/test/perf/framework.cc deleted file mode 100644 index 85f7abd8..00000000 --- a/test/perf/framework.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "framework.hpp" - -#include -#include -#include -#include - -namespace mscclpp { -namespace test { - -// Global state for results -static std::vector g_results; -static int g_mpi_rank = 0; -static int g_mpi_size = 1; -static bool g_mpi_initialized = false; - -namespace utils { - -// Internal MPI helper functions (not exposed in header) -void initializeMPI(int argc, char* argv[]) { - if (g_mpi_initialized) return; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank); - MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size); - g_mpi_initialized = true; -} - -static void finalizeMPI() { - if (!g_mpi_initialized) return; - - MPI_Finalize(); - g_mpi_initialized = false; -} - -static int getMPIRank() { return g_mpi_rank; } - -static int getMPISize() { return g_mpi_size; } - -static bool isMainProcess() { return g_mpi_rank == 0; } - -// Public utility functions for test output -bool isMainRank() { return g_mpi_rank == 0; } - -void cleanupMPI() { finalizeMPI(); } - -std::string getCurrentTimestamp() { - auto now = std::chrono::system_clock::now(); - auto time_t = std::chrono::system_clock::to_time_t(now); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S"); - return ss.str(); -} - -void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics, - const std::map& test_params) { - TestResult result; - result.test_name = test_name; - result.test_category = test_category; - result.test_params = test_params; - result.metrics = metrics; - result.num_processes = g_mpi_size; - result.process_rank = g_mpi_rank; - result.timestamp = getCurrentTimestamp(); - - g_results.push_back(result); -} - -void writeResultsToFile(const std::string& filename) { - std::ofstream file(filename); - if (!file) { - throw std::runtime_error("Cannot open output file: " + filename); - } - - for (const auto& result : g_results) { - nlohmann::ordered_json j; - j["test_name"] = result.test_name; 
- j["test_category"] = result.test_category; - j["test_config"] = result.test_params; - j["metrics"] = result.metrics; - j["num_processes"] = result.num_processes; - j["process_rank"] = result.process_rank; - j["timestamp"] = result.timestamp; - - file << j.dump() << std::endl; - } -} - -void printResults(bool verbose) { - if (!isMainProcess()) return; - - std::cout << "\n=== Test Results ===" << std::endl; - - for (const auto& result : g_results) { - std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl; - - if (verbose && !result.test_params.empty()) { - std::cout << " Parameters:" << std::endl; - for (const auto& param : result.test_params) { - std::cout << " " << param.first << ": " << param.second << std::endl; - } - } - - std::cout << " Metrics:" << std::endl; - for (auto it = result.metrics.begin(); it != result.metrics.end(); ++it) { - std::cout << " " << it.key() << ": " << it.value() << std::endl; - } - } - std::cout << std::endl; -} - -// Timer implementation -Timer::Timer() : is_running_(false) {} - -void Timer::start() { - start_time_ = std::chrono::high_resolution_clock::now(); - is_running_ = true; -} - -void Timer::stop() { - end_time_ = std::chrono::high_resolution_clock::now(); - is_running_ = false; -} - -double Timer::elapsedMicroseconds() const { - if (is_running_) { - auto now = std::chrono::high_resolution_clock::now(); - return std::chrono::duration_cast(now - start_time_).count(); - } - return std::chrono::duration_cast(end_time_ - start_time_).count(); -} - -double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; } - -double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; } - -void cudaCheck(cudaError_t err, const char* file, int line) { - if (err != cudaSuccess) { - std::string msg = - std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err); - throw std::runtime_error(msg); - } -} - -int 
runMultipleTests( - int argc, char* argv[], - const std::vector>>& tests) { - int totalResult = 0; - - // Initialize MPI once for all tests - initializeMPI(argc, argv); - - try { - // Get MPI information - int rank = getMPIRank(); - int size = getMPISize(); - int local_rank = rank; // For simplicity, assume local_rank = rank - - for (const auto& test : tests) { - const std::string& testName = std::get<0>(test); - const std::string& testDescription = std::get<1>(test); - const std::function& testFunction = std::get<2>(test); - - if (rank == 0) { - std::cout << "Running test: " << testName << std::endl; - if (!testDescription.empty()) { - std::cout << " " << testDescription << std::endl; - } - } - - // Don't clear results - accumulate them for all tests in the same file - // g_results.clear(); // Commented out to accumulate results - - try { - // Run the individual test function with MPI information - testFunction(rank, size, local_rank); - - // Synchronize before moving to next test - MPI_Barrier(MPI_COMM_WORLD); - - } catch (const std::exception& e) { - if (rank == 0) { - std::cerr << "Error in test " << testName << ": " << e.what() << std::endl; - } - totalResult = 1; - } - } - - // Don't cleanup MPI here - let the caller handle it - // finalizeMPI(); - - } catch (const std::exception& e) { - if (g_mpi_rank == 0) { - std::cerr << "Error: " << e.what() << std::endl; - } - finalizeMPI(); - return 1; - } - - return totalResult; -} - -} // namespace utils -} // namespace test -} // namespace mscclpp diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp deleted file mode 100644 index e9b8c31f..00000000 --- a/test/perf/framework.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ -#define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace mscclpp { -namespace test { - -// Test result structure -struct TestResult { - std::string test_name; - std::string test_category; - std::map test_params; - nlohmann::ordered_json metrics; - int num_processes; - int process_rank; - std::string timestamp; -}; - -// Simple utility functions for testing -namespace utils { - -// Test execution utilities -int runMultipleTests( - int argc, char* argv[], - const std::vector>>& tests); - -// MPI management -void initializeMPI(int argc, char* argv[]); -void cleanupMPI(); -bool isMainRank(); - -// Result recording -void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics, - const std::map& test_params = {}); - -// Output utilities -void writeResultsToFile(const std::string& filename); -void printResults(bool verbose = false); -void cleanupMPI(); - -// Timing utilities -class Timer { - public: - Timer(); - void start(); - void stop(); - double elapsedMicroseconds() const; - double elapsedMilliseconds() const; - double elapsedSeconds() const; - - private: - std::chrono::high_resolution_clock::time_point start_time_; - std::chrono::high_resolution_clock::time_point end_time_; - bool is_running_; -}; - -// CUDA utilities -void cudaCheck(cudaError_t err, const char* file, int line); -#define CUDA_CHECK(call) cudaCheck(call, __FILE__, __LINE__) - -} // namespace utils - -} // namespace test -} // namespace mscclpp - -#endif // MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 312d31ef..7836e063 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,11 +1,13 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. 
target_sources(unit_tests PRIVATE + unit_tests_main.cc core_tests.cc gpu_utils_tests.cc errors_tests.cc fifo_tests.cu + fifo_perf_tests.cu numa_tests.cc socket_tests.cc utils_tests.cc diff --git a/test/unit/compile_tests.cu b/test/unit/compile_tests.cu index 9db91a4f..893bb940 100644 --- a/test/unit/compile_tests.cu +++ b/test/unit/compile_tests.cu @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. -#include +#include "../framework.hpp" #undef NDEBUG #ifndef DEBUG_BUILD diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc index 32e6a1b5..d2552ff3 100644 --- a/test/unit/core_tests.cc +++ b/test/unit/core_tests.cc @@ -1,12 +1,14 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include +// Licensed under the MIT License. #include -class LocalCommunicatorTest : public ::testing::Test { +#include "../framework.hpp" + +// TODO: TransportFlags needs operator<< for EXPECT_EQ to work +// Using ASSERT_TRUE with manual comparisons as workaround + +class LocalCommunicatorTest : public ::mscclpp::test::TestCase { protected: void SetUp() override { bootstrap = std::make_shared(0, 1); @@ -18,15 +20,15 @@ class LocalCommunicatorTest : public ::testing::Test { std::shared_ptr comm; }; -TEST_F(LocalCommunicatorTest, RegisterMemory) { +TEST(LocalCommunicatorTest, RegisterMemory) { int dummy[42]; auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); EXPECT_EQ(memory.data(), &dummy); EXPECT_EQ(memory.size(), sizeof(dummy)); - EXPECT_EQ(memory.transports(), mscclpp::NoTransports); + ASSERT_TRUE(memory.transports() == mscclpp::NoTransports); } -TEST_F(LocalCommunicatorTest, SendMemoryToSelf) { +TEST(LocalCommunicatorTest, SendMemoryToSelf) { int dummy[42]; auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); comm->sendMemory(memory, 0); @@ -34,5 +36,5 @@ TEST_F(LocalCommunicatorTest, SendMemoryToSelf) { 
auto sameMemory = memoryFuture.get(); EXPECT_EQ(sameMemory.data(), memory.data()); EXPECT_EQ(sameMemory.size(), memory.size()); - EXPECT_EQ(sameMemory.transports(), memory.transports()); + ASSERT_TRUE(sameMemory.transports() == memory.transports()); } diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc index f9faad19..3eeed387 100644 --- a/test/unit/errors_tests.cc +++ b/test/unit/errors_tests.cc @@ -1,30 +1,33 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include +#include "../framework.hpp" + +// TODO: ErrorCode needs operator<< for EXPECT_EQ to work +// Using ASSERT_TRUE with manual comparisons as workaround + TEST(ErrorsTest, SystemError) { mscclpp::Error error("test", mscclpp::ErrorCode::SystemError); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::SystemError); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: SystemError)")); } TEST(ErrorsTest, InternalError) { mscclpp::Error error("test", mscclpp::ErrorCode::InternalError); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InternalError); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InternalError)")); } TEST(ErrorsTest, InvalidUsage) { mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InvalidUsage); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InvalidUsage)")); } TEST(ErrorsTest, Timeout) { mscclpp::Error error("test", mscclpp::ErrorCode::Timeout); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::Timeout); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: Timeout)")); } diff --git 
a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu new file mode 100644 index 00000000..34b5d6bc --- /dev/null +++ b/test/unit/fifo_perf_tests.cu @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include + +#include "../framework.hpp" + +// Simple FIFO performance test to be run as part of unit_tests +// This is a performance test that can be excluded from coverage runs +// using the --exclude-perf-tests flag. + +constexpr uint64_t TIMEOUT_SPINS = 1000000; +constexpr int MIN_TRIGGERS = 100; // Reduced for faster unit test execution + +__constant__ mscclpp::FifoDeviceHandle gFifoPerfDeviceHandle; + +__global__ void kernelFifoPerfPush(size_t numTriggers) { + mscclpp::FifoDeviceHandle& fifo = gFifoPerfDeviceHandle; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + mscclpp::ProxyTrigger trigger; + for (size_t i = 1; i <= numTriggers; ++i) { + trigger.fst = i; + trigger.snd = tid ^ i; + fifo.push(trigger); + } +} + +static bool consumePerfTriggers(std::unique_ptr& hostFifo, int numTriggers, int parallel) { + int totalTriggers = numTriggers * parallel; + std::unordered_map triggerCounts; + for (int i = 0; i < totalTriggers; ++i) { + mscclpp::ProxyTrigger trigger; + uint64_t spin = 0; + do { + trigger = hostFifo->poll(); + if (spin++ > TIMEOUT_SPINS) { + return false; + } + } while (trigger.fst == 0 || trigger.snd == 0); + + trigger.snd ^= ((uint64_t)1 << (uint64_t)63); + trigger.snd = trigger.snd ^ trigger.fst; + if (triggerCounts[trigger.snd] + 1 != trigger.fst) { + return false; // Validation failed + } + triggerCounts[trigger.snd]++; + hostFifo->pop(); + } + return true; +} + +PERF_TEST(FifoPerfTest, BasicPerformance) { + int cudaDevice, numaNode; + CUDA_CHECK(cudaGetDevice(&cudaDevice)); + numaNode = mscclpp::getDeviceNumaNode(cudaDevice); + mscclpp::numaBind(numaNode); + + const int fifoSize = 128; + const int numTriggers = MIN_TRIGGERS; + const int 
numParallel = 1; + + auto hostFifo = std::make_unique(fifoSize); + mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle(); + CUDA_CHECK(cudaMemcpyToSymbol(gFifoPerfDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle))); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Run kernel + kernelFifoPerfPush<<>>(numTriggers); + CUDA_CHECK(cudaGetLastError()); + + // Process triggers + bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel); + ASSERT_TRUE(success); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaDeviceSynchronize()); +} diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu index b67a220d..8d30ca5e 100644 --- a/test/unit/fifo_tests.cu +++ b/test/unit/fifo_tests.cu @@ -1,13 +1,12 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include #include #include +#include "../framework.hpp" #include "utils_internal.hpp" #define ITER 10000 // should be larger than the FIFO size for proper testing diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc index f4aba0d7..977314e9 100644 --- a/test/unit/gpu_utils_tests.cc +++ b/test/unit/gpu_utils_tests.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include +#include "../framework.hpp" + TEST(GpuUtilsTest, StreamPool) { auto streamPool = mscclpp::gpuStreamPool(); cudaStream_t s; diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu index 50ffc9ea..699baa38 100644 --- a/test/unit/local_channel_tests.cu +++ b/test/unit/local_channel_tests.cu @@ -1,13 +1,13 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. 
#include #include #include #include +#include "../framework.hpp" + #define MAGIC_CONST 777 __constant__ mscclpp::PortChannelDeviceHandle gPortChannel; diff --git a/test/unit/numa_tests.cc b/test/unit/numa_tests.cc index dfa63a74..46bf5e18 100644 --- a/test/unit/numa_tests.cc +++ b/test/unit/numa_tests.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include +#include "../framework.hpp" + TEST(NumaTest, Basic) { int num; MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num)); diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc index 1ab592ba..a5598938 100644 --- a/test/unit/socket_tests.cc +++ b/test/unit/socket_tests.cc @@ -1,11 +1,10 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include +#include "../framework.hpp" #include "socket.h" #include "utils_internal.hpp" diff --git a/test/unit/unit_tests_main.cc b/test/unit/unit_tests_main.cc new file mode 100644 index 00000000..397566e0 --- /dev/null +++ b/test/unit/unit_tests_main.cc @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "../framework.hpp" + +int main(int argc, char** argv) { return RUN_ALL_TESTS(); } diff --git a/test/unit/utils_internal_tests.cc b/test/unit/utils_internal_tests.cc index 5479a681..8526d9fe 100644 --- a/test/unit/utils_internal_tests.cc +++ b/test/unit/utils_internal_tests.cc @@ -1,10 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#include - #include +#include "../framework.hpp" #include "utils_internal.hpp" TEST(UtilsInternalTest, getHostHash) { diff --git a/test/unit/utils_tests.cc b/test/unit/utils_tests.cc index fa079b30..51562c21 100644 --- a/test/unit/utils_tests.cc +++ b/test/unit/utils_tests.cc @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include +// Licensed under the MIT License. #include #include #include +#include "../framework.hpp" + TEST(UtilsTest, getHostName) { std::string hostname1 = mscclpp::getHostName(1024, '.'); EXPECT_FALSE(hostname1.empty());