From ecd33722d4e3bc108994d9517824352efbf30bfa Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 13 Apr 2026 21:51:29 -0700 Subject: [PATCH 1/4] Fix multi-node H100 CI: CUDA compat, deploy improvements (#781) ## Summary - **Multi-node H100 CI setup**: Improve architecture detection and GPU configuration - **Remove hardcoded VMSS hostnames** from deploy files - **Fix CUDA compat library issue**: Remove stale compat paths from Docker image for CUDA 12+. Instead, `peer_access_test` now returns a distinct exit code (2) for CUDA init failure, and `setup.sh` conditionally adds compat libs only when needed. This fixes `cudaErrorSystemNotReady` (error 803) when the host driver is newer than the container's compat libs. - **Speed up deploy**: Replace recursive `parallel-scp` with tar+scp+untar to avoid per-file SSH overhead. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/multi-nodes-test.yml | 73 +++++++++----- .../templates/run-remote-task.yml | 4 + docker/build.sh | 5 - src/core/registered_memory.cc | 22 ++++- test/deploy/config | 8 -- test/deploy/deploy.sh | 24 ++++- test/deploy/hostfile | 2 - test/deploy/hostfile_mpi | 2 - test/deploy/perf_ndmv5.jsonl | 9 +- test/deploy/run_tests.sh | 96 +++++++++++-------- test/deploy/setup.sh | 33 ++++++- tools/peer-access-test/peer_access_test.cu | 10 +- 12 files changed, 200 insertions(+), 88 deletions(-) delete mode 100644 test/deploy/config delete mode 100644 test/deploy/hostfile delete mode 100644 test/deploy/hostfile_mpi diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index d4924879..3b3ebe1f 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -16,23 +16,24 @@ pr: none parameters: +- name: vmssName + type: string + default: mscclpp-h100-multinode-ci - name: hostEntries type: string default: | - 10.0.0.10 mscclit-000000 - 10.0.0.11 mscclit-000001 + 10.0.0.5 mscclpp-h100-multinode-ci000000 + 10.0.0.4 mscclpp-h100-multinode-ci000001 jobs: - job: MultiNodesTest displayName: Multi nodes test strategy: matrix: - cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 pool: - name: mscclpp-it + name: mscclpp-multi-node container: image: $[ variables['containerImage'] ] @@ -42,25 +43,53 @@ jobs: inputs: targetType: 'inline' script: | - ENTRY="${{ parameters.hostEntries }}" - if ! grep -qxF "$ENTRY" /etc/hosts; then - echo "Adding to /etc/hosts" - echo "$ENTRY" | sudo tee -a /etc/hosts - else - echo "Entry already exists, nothing to do." - fi + while IFS= read -r line; do + [ -z "$line" ] && continue + if ! 
grep -qxF "$line" /etc/hosts; then + echo "Adding to /etc/hosts: $line" + echo "$line" | sudo tee -a /etc/hosts + else + echo "Entry already exists: $line" + fi + done <<< "${{ parameters.hostEntries }}" + + - task: Bash@3 + displayName: Generate deploy files + inputs: + targetType: 'inline' + script: | + set -e + VMSS="${{ parameters.vmssName }}" + DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy" + NODE0="${VMSS}000000" + NODE1="${VMSS}000001" + + echo "Host ${NODE0} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no + Host ${NODE1} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no" > "${DEPLOY_DIR}/config" + + printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile" + + printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi" - template: templates/deploy.yml parameters: - subscription: msccl-it - vmssName: mscclit-vmss - resourceGroup: msccl-IT + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp + gpuArch: '90' - template: templates/run-remote-task.yml parameters: name: RunMscclppTest displayName: Run multi-nodes mscclpp-test - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + continueOnError: true + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test @@ -68,7 +97,7 @@ jobs: parameters: name: RunMultiNodeUnitTest displayName: Run multi-nodes unit tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh mp-ut @@ -76,7 +105,7 @@ jobs: parameters: name: RunMultiNodePythonTests displayName: Run multi-nodes python tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh pytests @@ -84,12 +113,12 @@ jobs: parameters: name: RunMultiNodePythonBenchmark displayName: Run multi-nodes python benchmark - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - template: templates/stop.yml parameters: - subscription: msccl-it - vmssName: mscclit-vmss - resourceGroup: msccl-IT + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml index 37b3a7d7..3ca0d98a 100644 --- a/.azure-pipelines/templates/run-remote-task.yml +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -12,12 +12,16 @@ parameters: - name: workingDirectory type: string default: '$(System.DefaultWorkingDirectory)' +- name: continueOnError + type: boolean + default: false steps: - task: Bash@3 ${{ 
if ne(parameters.name, '') }}:
      name: ${{ parameters.name }}
    displayName: ${{ parameters.displayName }}
+   continueOnError: ${{ parameters.continueOnError }}
    inputs:
      targetType: 'inline'
      script: |
diff --git a/docker/build.sh b/docker/build.sh
index 89568e19..651a6122 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -14,11 +14,6 @@ baseImageTable=(
 
 declare -A extraLdPathTable
 extraLdPathTable=(
-    ["cuda11.8"]="/usr/local/cuda-11.8/compat"
-    ["cuda12.4"]="/usr/local/cuda-12.4/compat"
-    ["cuda12.8"]="/usr/local/cuda-12.8/compat"
-    ["cuda12.9"]="/usr/local/cuda-12.9/compat"
-    ["cuda13.0"]="/usr/local/cuda-13.0/compat"
     ["rocm6.2"]="/opt/rocm/lib"
 )
diff --git a/src/core/registered_memory.cc b/src/core/registered_memory.cc
index cb231a0f..f464de2a 100644
--- a/src/core/registered_memory.cc
+++ b/src/core/registered_memory.cc
@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
       }
     }
   } else if (transports.has(Transport::CudaIpc)) {
+    // When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
+    // try CudaIpc first and fall back to IB on failure.
     auto entry = getTransportInfo(Transport::CudaIpc);
-    auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
-    // Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
-    this->remoteMemMap = gpuIpcMem->map();
-    this->data = this->remoteMemMap.get();
+    bool hasIB = (transports & AllIBTransports).any();
+    try {
+      auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
+      this->remoteMemMap = gpuIpcMem->map();
+      this->data = this->remoteMemMap.get();
+    } catch (const BaseError& e) {
+      if (!hasIB) {
+        throw;
+      }
+      bool isSameHost = (getHostHash() == this->hostHash);
+      if (isSameHost) {
+        WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
+      } else {
+        INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
+      }
+    }
   }
   if (this->data != nullptr) {
     INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);
diff --git a/test/deploy/config b/test/deploy/config
deleted file mode 100644
index 2905f752..00000000
--- a/test/deploy/config
+++ /dev/null
@@ -1,8 +0,0 @@
-Host mscclit-000000
-  Port 22345
-  IdentityFile /root/mscclpp/sshkey
-  StrictHostKeyChecking no
-Host mscclit-000001
-  Port 22345
-  IdentityFile /root/mscclpp/sshkey
-  StrictHostKeyChecking no
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 1f1d0e52..e6f1259c 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -33,12 +33,34 @@ done
 set -e
 
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
-parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
+tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
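+# Packing a single archive and extracting it on each host avoids the
+# per-file SSH round-trips that recursive parallel-scp incurs on large trees.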
+parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz +parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz" +rm -f /tmp/mscclpp.tar.gz if [ "${PLATFORM}" == "rocm" ]; then parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu" fi +# Install GDRCopy kernel module on host VMs (CUDA only) +GDRCOPY_VERSION="2.5.2" +if [ "${PLATFORM}" == "cuda" ]; then + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "if lsmod | grep -q gdrdrv; then + echo 'gdrdrv module already loaded' + else + set -e + sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms + cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz + tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages + CUDA=/usr/local/cuda ./build-deb-packages.sh + sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb + sudo modprobe gdrdrv + rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION} + fi" +fi + # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" diff --git a/test/deploy/hostfile b/test/deploy/hostfile deleted file mode 100644 index b1bfc1df..00000000 --- a/test/deploy/hostfile +++ /dev/null @@ -1,2 +0,0 @@ -azureuser@mscclit-000000 -azureuser@mscclit-000001 diff --git a/test/deploy/hostfile_mpi b/test/deploy/hostfile_mpi deleted file mode 100644 index ac2514da..00000000 --- a/test/deploy/hostfile_mpi +++ /dev/null @@ -1,2 +0,0 @@ -mscclit-000000 -mscclit-000001 diff --git a/test/deploy/perf_ndmv5.jsonl b/test/deploy/perf_ndmv5.jsonl index 042c6822..df36de78 100644 --- a/test/deploy/perf_ndmv5.jsonl +++ b/test/deploy/perf_ndmv5.jsonl @@ -1,3 +1,10 @@ {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"} {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"} -{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} \ No newline at end of file +{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} +{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"} +{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"} +{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"} +{"name":"alltoall", "kernel":0, 
"ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"} \ No newline at end of file diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index 0c05a090..6a70c76e 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,83 +1,99 @@ set -e HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi +HEAD_HOST=$(head -1 ${HOSTFILE}) +# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface +HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) +if [ -z "${HEAD_IP}" ]; then + HEAD_IP=${HEAD_HOST} +fi +MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0" +MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" + +# Select perf baseline based on GPU type +GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1) +if echo "${GPU_NAME}" | grep -qi "H100"; then + PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl +else + PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl +fi function run_mscclpp_test() { echo "=================Run allgather_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl # For kernel 2, the message size must can be divided by 3 - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl echo "==================Run allreduce_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 
/root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl echo "==================Run alltoall_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl echo "========================Run performance check===============================" python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \ - --baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl + --baseline-file ${PERF_BASELINE} } function run_mp_ut() { echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)=========================" - mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 + mpirun ${MPI_ARGS} -tag-output -np 2 \ + ${MSCCLPP_ENV} \ + -npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003 echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)=========================" - mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 + mpirun ${MPI_ARGS} -tag-output -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003 } function run_pytests() { echo "==================Run python 
tests================================" - mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ + mpirun ${MPI_ARGS} -tag-output -np 16 \ + ${MSCCLPP_ENV} \ -x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh } function run_py_benchmark() { echo "==================Run python benchmark================================" - mpirun -allow-run-as-root -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \ -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \ - -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py + -x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py } if [ $# -lt 1 ]; then diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index d4996cc2..bc29efd8 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}" mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys -mv /root/mscclpp/test/deploy/config /root/.ssh/config -chown root:root /root/.ssh/config chmod 400 /root/mscclpp/sshkey chown root:root /root/mscclpp/sshkey +# Generate SSH config from hostfile_mpi +HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi +if [ -f "${HOSTFILE_MPI}" ]; then + > /root/.ssh/config + while IFS= read -r host; do + echo "Host ${host}" >> /root/.ssh/config + echo " Port 22345" >> /root/.ssh/config + echo " IdentityFile /root/mscclpp/sshkey" >> /root/.ssh/config + echo " StrictHostKeyChecking no" >> /root/.ssh/config + done < "${HOSTFILE_MPI}" + chown root:root /root/.ssh/config +fi + if [ "${PLATFORM}" == "cuda" ]; then nvidia-smi -pm 1 for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do @@ -18,7 +29,25 @@ if [ "${PLATFORM}" == "cuda" ]; then fi make -C /root/mscclpp/tools/peer-access-test +set +e /root/mscclpp/tools/peer-access-test/peer_access_test +PEER_ACCESS_EXIT_CODE=$? +set -e +if [ ${PEER_ACCESS_EXIT_CODE} -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then + # Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch). + # Add CUDA compat libs for forward compatibility and retry. 
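+  # Compat libs let a newer container toolkit run on an older host driver;
+  # shipping stale compat paths unconditionally is what broke hosts with a
+  # newer driver (cudaErrorSystemNotReady, error 803).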
+  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
+  if [ -d "${CUDA_COMPAT_PATH}" ]; then
+    echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
+    export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}:${LD_LIBRARY_PATH}"
+    /root/mscclpp/tools/peer-access-test/peer_access_test
+  else
+    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
+    exit 1
+  fi
+elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then
+  exit ${PEER_ACCESS_EXIT_CODE}
+fi
 make -C /root/mscclpp/tools/peer-access-test clean
 
 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
diff --git a/tools/peer-access-test/peer_access_test.cu b/tools/peer-access-test/peer_access_test.cu
index 428ed1ac..03cb27a6 100644
--- a/tools/peer-access-test/peer_access_test.cu
+++ b/tools/peer-access-test/peer_access_test.cu
@@ -13,6 +13,10 @@ constexpr auto cudaSuccess = hipSuccess;
 
 #include <iostream>
 
+// Exit code 2 indicates CUDA initialization failure (e.g., driver/toolkit mismatch).
+// This allows callers to distinguish it from other failures and retry with compat libs.
+constexpr int EXIT_CUDA_INIT_FAILURE = 2;
+
 #define CUDACHECK(cmd)   \
   do {                   \
     cudaError_t e = cmd; \
@@ -25,7 +29,11 @@
 int main() {
   bool canAccessPeerAll = true;
   int devCount = 0;
-  CUDACHECK(cudaGetDeviceCount(&devCount));
+  cudaError_t err = cudaGetDeviceCount(&devCount);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed: cudaGetDeviceCount(&devCount) returned " << err << std::endl;
+    return EXIT_CUDA_INIT_FAILURE;
+  }
   std::cout << "Detected " << devCount << " device(s)" << std::endl;
   if (devCount >= 2) {
     for (int i = 0; i < devCount; ++i) {

From 572028ea3d9671a65ea9e9b8c65b0879328fae60 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Wed, 15 Apr 2026 12:55:40 -0700
Subject: [PATCH 2/4] Fix nccl-test CI building for all GPU architectures (#786)

## Problem

`nccl-test.yml` was the only CI template calling `deploy.yml` without passing `gpuArch`. Since the CI build machine has no GPU, CMake fell back to building for **all** supported architectures (`80;90;100;120`), unnecessarily slowing down CI builds.

## Fix

- Add `gpuArch` parameter to `nccl-test.yml` and forward it to `deploy.yml`
- Pass `gpuArch: '80'` (A100) and `gpuArch: '90'` (H100) from `nccl-api-test.yml`

All other templates were already passing `gpuArch` correctly.
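For illustration, the fallback that slowed builds down resembles this CMake pattern (a sketch only; the variable names are illustrative, not the project's actual CMake code):

```cmake
# With no gpuArch forwarded and no GPU to probe on the build machine,
# the build targets every supported architecture.
if(NOT DEFINED GPU_ARCH OR GPU_ARCH STREQUAL "")
  set(CMAKE_CUDA_ARCHITECTURES "80;90;100;120")  # builds all four variants
else()
  set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCH}")    # single-arch build
endif()
```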
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .azure-pipelines/nccl-api-test.yml       | 2 ++
 .azure-pipelines/templates/nccl-test.yml | 4 ++++
 test/deploy/deploy.sh                    | 4 ----
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.azure-pipelines/nccl-api-test.yml b/.azure-pipelines/nccl-api-test.yml
index cc017412..85b466ef 100644
--- a/.azure-pipelines/nccl-api-test.yml
+++ b/.azure-pipelines/nccl-api-test.yml
@@ -44,6 +44,7 @@ jobs:
     parameters:
       subscription: mscclpp-ci
       vmssName: mscclpp-ci
+      gpuArch: '80'
       nvccGencode: "-gencode=arch=compute_80,code=sm_80"
 
 - job: NcclTestH100
@@ -64,4 +65,5 @@ jobs:
     parameters:
       subscription: mscclpp-ci-h100
       vmssName: mscclpp-h100-ci
+      gpuArch: '90'
       nvccGencode: "-gencode=arch=compute_90,code=sm_90"
\ No newline at end of file
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
index 211e2393..fa3900f1 100644
--- a/.azure-pipelines/templates/nccl-test.yml
+++ b/.azure-pipelines/templates/nccl-test.yml
@@ -10,6 +10,9 @@ parameters:
   type: string
 - name: vmssName
   type: string
+- name: gpuArch
+  type: string
+  default: '80'
 - name: nvccGencode
   type: string
   default: "-gencode=arch=compute_80,code=sm_80"
@@ -19,6 +22,7 @@ steps:
     parameters:
       subscription: ${{ parameters.subscription }}
       vmssName: ${{ parameters.vmssName }}
+      gpuArch: ${{ parameters.gpuArch }}
       deployArgs: 'nccltest-single-node'
 
 - template: run-remote-task.yml
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index e6f1259c..6358787b 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -6,10 +6,6 @@ PLATFORM="${3:-cuda}"
 KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
 ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
 
-if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
-  ROOT_DIR="${ROOT_DIR}/mscclpp"
-  SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
-fi
 DST_DIR="/tmp/mscclpp"
 if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"

From eeea00b298e0674a48d498c3ca695d8e85f72dae Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Thu, 16 Apr 2026 21:24:45 -0700
Subject: [PATCH 3/4] Support python wheel build (#787)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Support Python wheel build

This PR modernizes the Python packaging for MSCCL++ by defining dependencies and optional extras in `pyproject.toml`, enabling proper wheel builds with `pip install ".[cuda12]"`.

### Changes

**`pyproject.toml`**
- Add `dependencies` (numpy, blake3, pybind11, sortedcontainers)
- Add `optional-dependencies` for platform-specific CuPy (`cuda11`, `cuda12`, `cuda13`, `rocm6`), `benchmark`, and `test` extras
- Bump minimum Python version from 3.8 to 3.10

**`test/deploy/setup.sh`**
- Use `pip install ".[<platform>,benchmark,test]"` instead of separate `pip install -r requirements_*.txt` + `pip install .` steps
- Add missing CUDA 13 case

**`docs/quickstart.md`**
- Update install instructions to use extras (e.g., `pip install ".[cuda12]"`)
- Document all available extras and clarify that `rocm6` builds CuPy from source
- Update Python version references to 3.10

**`python/csrc/CMakeLists.txt`**, **`python/test/CMakeLists.txt`**
- Update `find_package(Python)` from 3.8 to 3.10

### Notes
- The `requirements_*.txt` files are kept for Docker base image builds where only dependencies (not the project itself) should be installed.
- CuPy is intentionally not in base dependencies — users must specify a platform extra to get the correct pre-built wheel (or source build for ROCm). --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/quickstart.md | 36 ++++++++++++++++++++++++-------- pyproject.toml | 27 ++++++++++++++++++++++-- python/csrc/CMakeLists.txt | 2 +- python/mscclpp/_core/compiler.py | 8 +++++++ python/requirements_cuda11.txt | 2 +- python/requirements_cuda12.txt | 2 +- python/requirements_cuda13.txt | 2 +- python/requirements_rocm6.txt | 2 +- python/test/CMakeLists.txt | 2 +- test/deploy/setup.sh | 20 +++++++++++------- 10 files changed, 79 insertions(+), 24 deletions(-) diff --git a/docs/quickstart.md b/docs/quickstart.md index c9c98128..83a08d6a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -25,9 +25,9 @@ ```bash sudo apt-get install libnuma-dev ``` - * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package + * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package ```bash - sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)" + sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)" ``` If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)). * (Optional, for benchmarks) MPI @@ -100,13 +100,30 @@ There are a few optional CMake options you can set: (install-from-source-python-module)= ## Install from Source (Python Module) -Python 3.8 or later is required. +Python 3.10 or later is required. ```bash -# For NVIDIA platforms -$ python -m pip install . -# For AMD platforms, set the C++ compiler to HIPCC -$ CXX=/opt/rocm/bin/hipcc python -m pip install . +# For NVIDIA platforms (specify your CUDA version) +$ python -m pip install ".[cuda12]" +# For AMD platforms +$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]" +``` + +> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy. +> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source, +> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy. + +Optional extras can be installed by specifying them in brackets. Available extras: +- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version. +- **`rocm6`**: Install CuPy from source for AMD ROCm platforms. +- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib). +- **`test`**: Install test dependencies (pytest, mpi4py, netifaces). + +```bash +# Example: install with CUDA 12 and benchmark extras +$ python -m pip install ".[cuda12,benchmark]" +# Example: install with all extras for testing on CUDA 12 +$ python -m pip install ".[cuda12,benchmark,test]" ``` (vscode-dev-container)= @@ -158,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0 [Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. ```bash -# Choose `requirements_*.txt` according to your CUDA/ROCm version. -$ python3 -m pip install -r ./python/requirements_cuda12.txt +# Install with benchmark dependencies and the appropriate CUDA/ROCm extras. 
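+# Each platform extra pulls the matching pre-built CuPy wheel (e.g., cupy-cuda12x).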
+# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6. +$ python3 -m pip install ".[cuda12,benchmark,test]" $ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py ``` diff --git a/pyproject.toml b/pyproject.toml index 651fec3b..0ea569cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build" name = "mscclpp" dynamic = ["version"] description = "MSCCL++ Python API" -requires-python = ">=3.8" +requires-python = ">=3.10" +dependencies = [ + "numpy", + "blake3", + "pybind11", + "sortedcontainers", +] + +[project.optional-dependencies] +cuda11 = ["cupy-cuda11x"] +cuda12 = ["cupy-cuda12x"] +cuda13 = ["cupy-cuda13x"] +rocm6 = ["cupy"] +benchmark = [ + "mpi4py", + "prettytable", + "netifaces", + "matplotlib", +] +test = [ + "pytest", + "mpi4py", + "netifaces", +] [tool.setuptools_scm] write_to = "python/mscclpp/_version.py" @@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF" [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py310'] include = '\.pyi?$' diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt index 44fb150f..7c7bf3b9 100644 --- a/python/csrc/CMakeLists.txt +++ b/python/csrc/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2) FetchContent_MakeAvailable(nanobind) diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index b2da976d..3b77ce8e 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -192,6 +192,9 @@ class NativeCodeCompiler: """ def __init__(self): + self._initialized = False + + def _do_init(self): self._is_hip = cp.cuda.runtime.is_hip self._device_arch = get_device_arch() self._compiler = self._get_compiler() @@ -226,6 +229,7 @@ class NativeCodeCompiler: ] self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) + self._initialized = True def _get_compiler(self) -> str: """Get the path to the appropriate compiler. @@ -246,6 +250,8 @@ class NativeCodeCompiler: Returns: str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD). 
""" + if not self._initialized: + self._do_init() return self._device_arch def __call__(self, name: str, file: str, **kwds): @@ -290,6 +296,8 @@ class NativeCodeCompiler: >>> # Use the module to create an algorithm >>> algo = module.create_allreduce_algorithm(comm, buffer, size) """ + if not self._initialized: + self._do_init() if not os.path.isfile(file): raise FileNotFoundError(f"The specified source file does not exist: {file}") diff --git a/python/requirements_cuda11.txt b/python/requirements_cuda11.txt index 4e2e9371..a9786071 100644 --- a/python/requirements_cuda11.txt +++ b/python/requirements_cuda11.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda12.txt b/python/requirements_cuda12.txt index e1c9b726..71572714 100644 --- a/python/requirements_cuda12.txt +++ b/python/requirements_cuda12.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index 49cf13bc..95e99533 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index 7ed4fef3..757d4e26 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/test/CMakeLists.txt b/python/test/CMakeLists.txt index be62aea9..e55711d2 100644 --- a/python/test/CMakeLists.txt +++ b/python/test/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) FetchContent_MakeAvailable(nanobind) diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index bc29efd8..2a88a310 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -50,12 +50,6 @@ elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then fi make -C /root/mscclpp/tools/peer-access-test clean -if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda11.txt -elif [[ "${CUDA_VERSION}" == *"12."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda12.txt -fi - if [ "${PLATFORM}" == "rocm" ]; then export CXX=/opt/rocm/bin/hipcc fi @@ -65,7 +59,19 @@ if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})" echo "Using CMAKE_ARGS: ${CMAKE_ARGS}" fi -cd /root/mscclpp && pip3 install . 
+ +cd /root/mscclpp +if [[ "${CUDA_VERSION}" == *"11."* ]]; then + pip3 install ".[cuda11,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"12."* ]]; then + pip3 install ".[cuda12,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"13."* ]]; then + pip3 install ".[cuda13,benchmark,test]" +elif [ "${PLATFORM}" == "rocm" ]; then + pip3 install ".[rocm6,benchmark,test]" +else + pip3 install ".[benchmark,test]" +fi pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files From e874bf16663a30cd50c168d244211a4ab9c5b97c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 10:12:40 -0700 Subject: [PATCH 4/4] fix: isCuMemMapAllocated crashes on non-NVLS systems even with MSCCLPP_FORCE_DISABLE_NVLS=true (#790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [x] Fix `isCuMemMapAllocated()` to just return `true/false` without throwing when NVLS is not supported - [x] Fix `isNvlsSupported()` caching bug where `result`/`isChecked` were never updated - [x] Restore `[[maybe_unused]]` on `result` and `isChecked` statics — needed in HIP/ROCm env where `CUDA_NVLS_API_AVAILABLE` is not defined and the variables would otherwise be unused - [x] Run linter (`./tools/lint.sh`) --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com> --- src/core/gpu_utils.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 628d2dcb..09d5025d 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -283,7 +283,9 @@ bool isNvlsSupported() { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId)); MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - return isMulticastSupported == 1; + result = (isMulticastSupported == 1); + isChecked = true; + return result; } return result; #endif @@ -300,9 +302,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) { return false; } MSCCLPP_CUTHROW(cuMemRelease(handle)); - if (!isNvlsSupported()) { - throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage); - } return true; #endif }
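
For reference, the corrected caching in `isNvlsSupported()` reduces to the following pattern (a minimal sketch around the hunk above; the full function also handles `MSCCLPP_FORCE_DISABLE_NVLS` and other checks not shown here):

```cpp
bool isNvlsSupported() {
  // [[maybe_unused]] keeps HIP/ROCm builds warning-free: without
  // CUDA_NVLS_API_AVAILABLE the block below compiles away and the
  // statics would otherwise trigger unused-variable warnings.
  [[maybe_unused]] static bool result = false;
  [[maybe_unused]] static bool isChecked = false;
#if defined(CUDA_NVLS_API_AVAILABLE)
  if (!isChecked) {
    int deviceId = 0, isMulticastSupported = 0;
    CUdevice dev;
    MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
    MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
    result = (isMulticastSupported == 1);
    isChecked = true;  // previously never set, so the device query re-ran on every call
  }
#endif
  return result;
}
```

With the cache fixed, `isCuMemMapAllocated()` can simply report whether `cuMemRetainAllocationHandle` succeeded for the pointer, rather than throwing on systems without NVLS support.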