From ecd33722d4e3bc108994d9517824352efbf30bfa Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 13 Apr 2026 21:51:29 -0700 Subject: [PATCH 1/4] Fix multi-node H100 CI: CUDA compat, deploy improvements (#781) ## Summary - **Multi-node H100 CI setup**: Improve architecture detection and GPU configuration - **Remove hardcoded VMSS hostnames** from deploy files - **Fix CUDA compat library issue**: Remove stale compat paths from Docker image for CUDA 12+. Instead, `peer_access_test` now returns a distinct exit code (2) for CUDA init failure, and `setup.sh` conditionally adds compat libs only when needed. This fixes `cudaErrorSystemNotReady` (error 803) when the host driver is newer than the container's compat libs. - **Speed up deploy**: Replace recursive `parallel-scp` with tar+scp+untar to avoid per-file SSH overhead. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/multi-nodes-test.yml | 73 +++++++++----- .../templates/run-remote-task.yml | 4 + docker/build.sh | 5 - src/core/registered_memory.cc | 22 ++++- test/deploy/config | 8 -- test/deploy/deploy.sh | 24 ++++- test/deploy/hostfile | 2 - test/deploy/hostfile_mpi | 2 - test/deploy/perf_ndmv5.jsonl | 9 +- test/deploy/run_tests.sh | 96 +++++++++++-------- test/deploy/setup.sh | 33 ++++++- tools/peer-access-test/peer_access_test.cu | 10 +- 12 files changed, 200 insertions(+), 88 deletions(-) delete mode 100644 test/deploy/config delete mode 100644 test/deploy/hostfile delete mode 100644 test/deploy/hostfile_mpi diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index d4924879..3b3ebe1f 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -16,23 +16,24 @@ pr: none parameters: +- name: vmssName + type: string + default: mscclpp-h100-multinode-ci - name: hostEntries type: string default: | - 10.0.0.10 mscclit-000000 - 10.0.0.11 mscclit-000001 + 10.0.0.5 mscclpp-h100-multinode-ci000000 + 10.0.0.4 mscclpp-h100-multinode-ci000001 jobs: - job: MultiNodesTest displayName: Multi nodes test strategy: matrix: - cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 pool: - name: mscclpp-it + name: mscclpp-multi-node container: image: $[ variables['containerImage'] ] @@ -42,25 +43,53 @@ jobs: inputs: targetType: 'inline' script: | - ENTRY="${{ parameters.hostEntries }}" - if ! grep -qxF "$ENTRY" /etc/hosts; then - echo "Adding to /etc/hosts" - echo "$ENTRY" | sudo tee -a /etc/hosts - else - echo "Entry already exists, nothing to do." - fi + while IFS= read -r line; do + [ -z "$line" ] && continue + if ! 
grep -qxF "$line" /etc/hosts; then + echo "Adding to /etc/hosts: $line" + echo "$line" | sudo tee -a /etc/hosts + else + echo "Entry already exists: $line" + fi + done <<< "${{ parameters.hostEntries }}" + + - task: Bash@3 + displayName: Generate deploy files + inputs: + targetType: 'inline' + script: | + set -e + VMSS="${{ parameters.vmssName }}" + DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy" + NODE0="${VMSS}000000" + NODE1="${VMSS}000001" + + echo "Host ${NODE0} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no + Host ${NODE1} + Port 22345 + IdentityFile /root/mscclpp/sshkey + StrictHostKeyChecking no" > "${DEPLOY_DIR}/config" + + printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile" + + printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi" - template: templates/deploy.yml parameters: - subscription: msccl-it - vmssName: mscclit-vmss - resourceGroup: msccl-IT + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp + gpuArch: '90' - template: templates/run-remote-task.yml parameters: name: RunMscclppTest displayName: Run multi-nodes mscclpp-test - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + continueOnError: true + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test @@ -68,7 +97,7 @@ jobs: parameters: name: RunMultiNodeUnitTest displayName: Run multi-nodes unit tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh mp-ut @@ -76,7 +105,7 @@ jobs: parameters: name: RunMultiNodePythonTests displayName: Run multi-nodes python tests - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh pytests @@ -84,12 +113,12 @@ jobs: parameters: name: RunMultiNodePythonBenchmark displayName: Run multi-nodes python benchmark - runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser' remoteScript: | bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - template: templates/stop.yml parameters: - subscription: msccl-it - vmssName: mscclit-vmss - resourceGroup: msccl-IT + subscription: mscclpp-ci-h100 + vmssName: ${{ parameters.vmssName }} + resourceGroup: mscclpp diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml index 37b3a7d7..3ca0d98a 100644 --- a/.azure-pipelines/templates/run-remote-task.yml +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -12,12 +12,16 @@ parameters: - name: workingDirectory type: string default: '$(System.DefaultWorkingDirectory)' +- name: continueOnError + type: boolean + default: false steps: - task: Bash@3 ${{ 
if ne(parameters.name, '') }}:
      name: ${{ parameters.name }}
    displayName: ${{ parameters.displayName }}
+   continueOnError: ${{ parameters.continueOnError }}
    inputs:
      targetType: 'inline'
      script: |
diff --git a/docker/build.sh b/docker/build.sh
index 89568e19..651a6122 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -14,11 +14,6 @@ baseImageTable=(
 
 declare -A extraLdPathTable
 extraLdPathTable=(
-    ["cuda11.8"]="/usr/local/cuda-11.8/compat"
-    ["cuda12.4"]="/usr/local/cuda-12.4/compat"
-    ["cuda12.8"]="/usr/local/cuda-12.8/compat"
-    ["cuda12.9"]="/usr/local/cuda-12.9/compat"
-    ["cuda13.0"]="/usr/local/cuda-13.0/compat"
     ["rocm6.2"]="/opt/rocm/lib"
 )
diff --git a/src/core/registered_memory.cc b/src/core/registered_memory.cc
index cb231a0f..f464de2a 100644
--- a/src/core/registered_memory.cc
+++ b/src/core/registered_memory.cc
@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
       }
     }
   } else if (transports.has(Transport::CudaIpc)) {
+    // When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
+    // try CudaIpc first and fall back to IB on failure.
     auto entry = getTransportInfo(Transport::CudaIpc);
-    auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
-    // Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
-    this->remoteMemMap = gpuIpcMem->map();
-    this->data = this->remoteMemMap.get();
+    bool hasIB = (transports & AllIBTransports).any();
+    try {
+      auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
+      this->remoteMemMap = gpuIpcMem->map();
+      this->data = this->remoteMemMap.get();
+    } catch (const BaseError& e) {
+      if (!hasIB) {
+        throw;
+      }
+      bool isSameHost = (getHostHash() == this->hostHash);
+      if (isSameHost) {
+        WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
+      } else {
+        INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
+      }
+    }
   }
   if (this->data != nullptr) {
     INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);
diff --git a/test/deploy/config b/test/deploy/config
deleted file mode 100644
index 2905f752..00000000
--- a/test/deploy/config
+++ /dev/null
@@ -1,8 +0,0 @@
-Host mscclit-000000
-  Port 22345
-  IdentityFile /root/mscclpp/sshkey
-  StrictHostKeyChecking no
-Host mscclit-000001
-  Port 22345
-  IdentityFile /root/mscclpp/sshkey
-  StrictHostKeyChecking no
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 1f1d0e52..e6f1259c 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -33,12 +33,34 @@ done
 set -e
 
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
-parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
+tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
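+# Packing a single archive and extracting it on each host avoids the
+# per-file SSH round-trips that recursive parallel-scp incurs on large trees.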
+parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz +parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz" +rm -f /tmp/mscclpp.tar.gz if [ "${PLATFORM}" == "rocm" ]; then parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu" fi +# Install GDRCopy kernel module on host VMs (CUDA only) +GDRCOPY_VERSION="2.5.2" +if [ "${PLATFORM}" == "cuda" ]; then + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "if lsmod | grep -q gdrdrv; then + echo 'gdrdrv module already loaded' + else + set -e + sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms + cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz + tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages + CUDA=/usr/local/cuda ./build-deb-packages.sh + sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb + sudo modprobe gdrdrv + rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION} + fi" +fi + # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" diff --git a/test/deploy/hostfile b/test/deploy/hostfile deleted file mode 100644 index b1bfc1df..00000000 --- a/test/deploy/hostfile +++ /dev/null @@ -1,2 +0,0 @@ -azureuser@mscclit-000000 -azureuser@mscclit-000001 diff --git a/test/deploy/hostfile_mpi b/test/deploy/hostfile_mpi deleted file mode 100644 index ac2514da..00000000 --- a/test/deploy/hostfile_mpi +++ /dev/null @@ -1,2 +0,0 @@ -mscclit-000000 -mscclit-000001 diff --git a/test/deploy/perf_ndmv5.jsonl b/test/deploy/perf_ndmv5.jsonl index 042c6822..df36de78 100644 --- a/test/deploy/perf_ndmv5.jsonl +++ b/test/deploy/perf_ndmv5.jsonl @@ -1,3 +1,10 @@ {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"} {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"} -{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} \ No newline at end of file +{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"} +{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"} +{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"} +{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"} +{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"} +{"name":"alltoall", "kernel":0, 
"ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"} \ No newline at end of file diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index 0c05a090..6a70c76e 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,83 +1,99 @@ set -e HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi +HEAD_HOST=$(head -1 ${HOSTFILE}) +# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface +HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) +if [ -z "${HEAD_IP}" ]; then + HEAD_IP=${HEAD_HOST} +fi +MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0" +MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH" + +# Select perf baseline based on GPU type +GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1) +if echo "${GPU_NAME}" | grep -qi "H100"; then + PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl +else + PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl +fi function run_mscclpp_test() { echo "=================Run allgather_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl # For kernel 2, the message size must can be divided by 3 - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl echo "==================Run allreduce_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 
/root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl echo "==================Run alltoall_test_perf on 2 nodes=========================" - mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ - -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl echo "========================Run performance check===============================" python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \ - --baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl + --baseline-file ${PERF_BASELINE} } function run_mp_ut() { echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)=========================" - mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 + mpirun ${MPI_ARGS} -tag-output -np 2 \ + ${MSCCLPP_ENV} \ + -npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003 echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)=========================" - mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 + mpirun ${MPI_ARGS} -tag-output -np 16 \ + ${MSCCLPP_ENV} \ + -npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003 } function run_pytests() { echo "==================Run python 
tests================================" - mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ + mpirun ${MPI_ARGS} -tag-output -np 16 \ + ${MSCCLPP_ENV} \ -x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh } function run_py_benchmark() { echo "==================Run python benchmark================================" - mpirun -allow-run-as-root -np 16 --bind-to numa \ - -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \ - -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ + mpirun ${MPI_ARGS} -np 16 \ + ${MSCCLPP_ENV} \ + -mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \ -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \ - -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py + -x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py } if [ $# -lt 1 ]; then diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index d4996cc2..bc29efd8 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}" mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys -mv /root/mscclpp/test/deploy/config /root/.ssh/config -chown root:root /root/.ssh/config chmod 400 /root/mscclpp/sshkey chown root:root /root/mscclpp/sshkey +# Generate SSH config from hostfile_mpi +HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi +if [ -f "${HOSTFILE_MPI}" ]; then + > /root/.ssh/config + while IFS= read -r host; do + echo "Host ${host}" >> /root/.ssh/config + echo " Port 22345" >> /root/.ssh/config + echo " IdentityFile /root/mscclpp/sshkey" >> /root/.ssh/config + echo " StrictHostKeyChecking no" >> /root/.ssh/config + done < "${HOSTFILE_MPI}" + chown root:root /root/.ssh/config +fi + if [ "${PLATFORM}" == "cuda" ]; then nvidia-smi -pm 1 for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do @@ -18,7 +29,25 @@ if [ "${PLATFORM}" == "cuda" ]; then fi make -C /root/mscclpp/tools/peer-access-test +set +e /root/mscclpp/tools/peer-access-test/peer_access_test +PEER_ACCESS_EXIT_CODE=$? +set -e +if [ ${PEER_ACCESS_EXIT_CODE} -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then + # Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch). + # Add CUDA compat libs for forward compatibility and retry. 
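+  # Compat libs let a newer container toolkit run on an older host driver;
+  # shipping stale compat paths unconditionally is what broke hosts with a
+  # newer driver (cudaErrorSystemNotReady, error 803).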
+  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
+  if [ -d "${CUDA_COMPAT_PATH}" ]; then
+    echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
+    export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}:${LD_LIBRARY_PATH}"
+    /root/mscclpp/tools/peer-access-test/peer_access_test
+  else
+    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
+    exit 1
+  fi
+elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then
+  exit ${PEER_ACCESS_EXIT_CODE}
+fi
 make -C /root/mscclpp/tools/peer-access-test clean
 
 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
diff --git a/tools/peer-access-test/peer_access_test.cu b/tools/peer-access-test/peer_access_test.cu
index 428ed1ac..03cb27a6 100644
--- a/tools/peer-access-test/peer_access_test.cu
+++ b/tools/peer-access-test/peer_access_test.cu
@@ -13,6 +13,10 @@ constexpr auto cudaSuccess = hipSuccess;
 
 #include <iostream>
 
+// Exit code 2 indicates CUDA initialization failure (e.g., driver/toolkit mismatch).
+// This allows callers to distinguish it from other failures and retry with compat libs.
+constexpr int EXIT_CUDA_INIT_FAILURE = 2;
+
 #define CUDACHECK(cmd)   \
   do {                   \
     cudaError_t e = cmd; \
@@ -25,7 +29,11 @@
 int main() {
   bool canAccessPeerAll = true;
   int devCount = 0;
-  CUDACHECK(cudaGetDeviceCount(&devCount));
+  cudaError_t err = cudaGetDeviceCount(&devCount);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed: cudaGetDeviceCount(&devCount) returned " << err << std::endl;
+    return EXIT_CUDA_INIT_FAILURE;
+  }
   std::cout << "Detected " << devCount << " device(s)" << std::endl;
   if (devCount >= 2) {
     for (int i = 0; i < devCount; ++i) {

From 572028ea3d9671a65ea9e9b8c65b0879328fae60 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Wed, 15 Apr 2026 12:55:40 -0700
Subject: [PATCH 2/4] Fix nccl-test CI building for all GPU architectures (#786)

## Problem

`nccl-test.yml` was the only CI template calling `deploy.yml` without passing `gpuArch`. Since the CI build machine has no GPU, CMake fell back to building for **all** supported architectures (`80;90;100;120`), unnecessarily slowing down CI builds.

## Fix

- Add `gpuArch` parameter to `nccl-test.yml` and forward it to `deploy.yml`
- Pass `gpuArch: '80'` (A100) and `gpuArch: '90'` (H100) from `nccl-api-test.yml`

All other templates were already passing `gpuArch` correctly.
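For illustration, the fallback that slowed builds down resembles this CMake pattern (a sketch only; the variable names are illustrative, not the project's actual CMake code):

```cmake
# With no gpuArch forwarded and no GPU to probe on the build machine,
# the build targets every supported architecture.
if(NOT DEFINED GPU_ARCH OR GPU_ARCH STREQUAL "")
  set(CMAKE_CUDA_ARCHITECTURES "80;90;100;120")  # builds all four variants
else()
  set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCH}")    # single-arch build
endif()
```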
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .azure-pipelines/nccl-api-test.yml       | 2 ++
 .azure-pipelines/templates/nccl-test.yml | 4 ++++
 test/deploy/deploy.sh                    | 4 ----
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.azure-pipelines/nccl-api-test.yml b/.azure-pipelines/nccl-api-test.yml
index cc017412..85b466ef 100644
--- a/.azure-pipelines/nccl-api-test.yml
+++ b/.azure-pipelines/nccl-api-test.yml
@@ -44,6 +44,7 @@ jobs:
     parameters:
       subscription: mscclpp-ci
       vmssName: mscclpp-ci
+      gpuArch: '80'
       nvccGencode: "-gencode=arch=compute_80,code=sm_80"
 
 - job: NcclTestH100
@@ -64,4 +65,5 @@ jobs:
     parameters:
       subscription: mscclpp-ci-h100
       vmssName: mscclpp-h100-ci
+      gpuArch: '90'
       nvccGencode: "-gencode=arch=compute_90,code=sm_90"
\ No newline at end of file
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
index 211e2393..fa3900f1 100644
--- a/.azure-pipelines/templates/nccl-test.yml
+++ b/.azure-pipelines/templates/nccl-test.yml
@@ -10,6 +10,9 @@ parameters:
   type: string
 - name: vmssName
   type: string
+- name: gpuArch
+  type: string
+  default: '80'
 - name: nvccGencode
   type: string
   default: "-gencode=arch=compute_80,code=sm_80"
@@ -19,6 +22,7 @@ steps:
     parameters:
       subscription: ${{ parameters.subscription }}
       vmssName: ${{ parameters.vmssName }}
+      gpuArch: ${{ parameters.gpuArch }}
       deployArgs: 'nccltest-single-node'
 
 - template: run-remote-task.yml
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index e6f1259c..6358787b 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -6,10 +6,6 @@ PLATFORM="${3:-cuda}"
 KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
 ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
 
-if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
-  ROOT_DIR="${ROOT_DIR}/mscclpp"
-  SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
-fi
 DST_DIR="/tmp/mscclpp"
 if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"

From eeea00b298e0674a48d498c3ca695d8e85f72dae Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Thu, 16 Apr 2026 21:24:45 -0700
Subject: [PATCH 3/4] Support python wheel build (#787)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Support Python wheel build

This PR modernizes the Python packaging for MSCCL++ by defining dependencies and optional extras in `pyproject.toml`, enabling proper wheel builds with `pip install ".[cuda12]"`.

### Changes

**`pyproject.toml`**
- Add `dependencies` (numpy, blake3, pybind11, sortedcontainers)
- Add `optional-dependencies` for platform-specific CuPy (`cuda11`, `cuda12`, `cuda13`, `rocm6`), `benchmark`, and `test` extras
- Bump minimum Python version from 3.8 to 3.10

**`test/deploy/setup.sh`**
- Use `pip install ".[<platform>,benchmark,test]"` instead of separate `pip install -r requirements_*.txt` + `pip install .` steps
- Add missing CUDA 13 case

**`docs/quickstart.md`**
- Update install instructions to use extras (e.g., `pip install ".[cuda12]"`)
- Document all available extras and clarify that `rocm6` builds CuPy from source
- Update Python version references to 3.10

**`python/csrc/CMakeLists.txt`**, **`python/test/CMakeLists.txt`**
- Update `find_package(Python)` from 3.8 to 3.10

### Notes
- The `requirements_*.txt` files are kept for Docker base image builds where only dependencies (not the project itself) should be installed.
- CuPy is intentionally not in base dependencies — users must specify a platform extra to get the correct pre-built wheel (or source build for ROCm). --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/quickstart.md | 36 ++++++++++++++++++++++++-------- pyproject.toml | 27 ++++++++++++++++++++++-- python/csrc/CMakeLists.txt | 2 +- python/mscclpp/_core/compiler.py | 8 +++++++ python/requirements_cuda11.txt | 2 +- python/requirements_cuda12.txt | 2 +- python/requirements_cuda13.txt | 2 +- python/requirements_rocm6.txt | 2 +- python/test/CMakeLists.txt | 2 +- test/deploy/setup.sh | 20 +++++++++++------- 10 files changed, 79 insertions(+), 24 deletions(-) diff --git a/docs/quickstart.md b/docs/quickstart.md index c9c98128..83a08d6a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -25,9 +25,9 @@ ```bash sudo apt-get install libnuma-dev ``` - * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package + * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package ```bash - sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)" + sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)" ``` If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)). * (Optional, for benchmarks) MPI @@ -100,13 +100,30 @@ There are a few optional CMake options you can set: (install-from-source-python-module)= ## Install from Source (Python Module) -Python 3.8 or later is required. +Python 3.10 or later is required. ```bash -# For NVIDIA platforms -$ python -m pip install . -# For AMD platforms, set the C++ compiler to HIPCC -$ CXX=/opt/rocm/bin/hipcc python -m pip install . +# For NVIDIA platforms (specify your CUDA version) +$ python -m pip install ".[cuda12]" +# For AMD platforms +$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]" +``` + +> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy. +> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source, +> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy. + +Optional extras can be installed by specifying them in brackets. Available extras: +- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version. +- **`rocm6`**: Install CuPy from source for AMD ROCm platforms. +- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib). +- **`test`**: Install test dependencies (pytest, mpi4py, netifaces). + +```bash +# Example: install with CUDA 12 and benchmark extras +$ python -m pip install ".[cuda12,benchmark]" +# Example: install with all extras for testing on CUDA 12 +$ python -m pip install ".[cuda12,benchmark,test]" ``` (vscode-dev-container)= @@ -158,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0 [Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. ```bash -# Choose `requirements_*.txt` according to your CUDA/ROCm version. -$ python3 -m pip install -r ./python/requirements_cuda12.txt +# Install with benchmark dependencies and the appropriate CUDA/ROCm extras. 
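+# Each platform extra pulls the matching pre-built CuPy wheel (e.g., cupy-cuda12x).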
+# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6. +$ python3 -m pip install ".[cuda12,benchmark,test]" $ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py ``` diff --git a/pyproject.toml b/pyproject.toml index 651fec3b..0ea569cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build" name = "mscclpp" dynamic = ["version"] description = "MSCCL++ Python API" -requires-python = ">=3.8" +requires-python = ">=3.10" +dependencies = [ + "numpy", + "blake3", + "pybind11", + "sortedcontainers", +] + +[project.optional-dependencies] +cuda11 = ["cupy-cuda11x"] +cuda12 = ["cupy-cuda12x"] +cuda13 = ["cupy-cuda13x"] +rocm6 = ["cupy"] +benchmark = [ + "mpi4py", + "prettytable", + "netifaces", + "matplotlib", +] +test = [ + "pytest", + "mpi4py", + "netifaces", +] [tool.setuptools_scm] write_to = "python/mscclpp/_version.py" @@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF" [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py310'] include = '\.pyi?$' diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt index 44fb150f..7c7bf3b9 100644 --- a/python/csrc/CMakeLists.txt +++ b/python/csrc/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2) FetchContent_MakeAvailable(nanobind) diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index b2da976d..3b77ce8e 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -192,6 +192,9 @@ class NativeCodeCompiler: """ def __init__(self): + self._initialized = False + + def _do_init(self): self._is_hip = cp.cuda.runtime.is_hip self._device_arch = get_device_arch() self._compiler = self._get_compiler() @@ -226,6 +229,7 @@ class NativeCodeCompiler: ] self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) + self._initialized = True def _get_compiler(self) -> str: """Get the path to the appropriate compiler. @@ -246,6 +250,8 @@ class NativeCodeCompiler: Returns: str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD). 
""" + if not self._initialized: + self._do_init() return self._device_arch def __call__(self, name: str, file: str, **kwds): @@ -290,6 +296,8 @@ class NativeCodeCompiler: >>> # Use the module to create an algorithm >>> algo = module.create_allreduce_algorithm(comm, buffer, size) """ + if not self._initialized: + self._do_init() if not os.path.isfile(file): raise FileNotFoundError(f"The specified source file does not exist: {file}") diff --git a/python/requirements_cuda11.txt b/python/requirements_cuda11.txt index 4e2e9371..a9786071 100644 --- a/python/requirements_cuda11.txt +++ b/python/requirements_cuda11.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda12.txt b/python/requirements_cuda12.txt index e1c9b726..71572714 100644 --- a/python/requirements_cuda12.txt +++ b/python/requirements_cuda12.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index 49cf13bc..95e99533 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index 7ed4fef3..757d4e26 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/test/CMakeLists.txt b/python/test/CMakeLists.txt index be62aea9..e55711d2 100644 --- a/python/test/CMakeLists.txt +++ b/python/test/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) FetchContent_MakeAvailable(nanobind) diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index bc29efd8..2a88a310 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -50,12 +50,6 @@ elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then fi make -C /root/mscclpp/tools/peer-access-test clean -if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda11.txt -elif [[ "${CUDA_VERSION}" == *"12."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda12.txt -fi - if [ "${PLATFORM}" == "rocm" ]; then export CXX=/opt/rocm/bin/hipcc fi @@ -65,7 +59,19 @@ if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})" echo "Using CMAKE_ARGS: ${CMAKE_ARGS}" fi -cd /root/mscclpp && pip3 install . 
+ +cd /root/mscclpp +if [[ "${CUDA_VERSION}" == *"11."* ]]; then + pip3 install ".[cuda11,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"12."* ]]; then + pip3 install ".[cuda12,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"13."* ]]; then + pip3 install ".[cuda13,benchmark,test]" +elif [ "${PLATFORM}" == "rocm" ]; then + pip3 install ".[rocm6,benchmark,test]" +else + pip3 install ".[benchmark,test]" +fi pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files From e874bf16663a30cd50c168d244211a4ab9c5b97c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 10:12:40 -0700 Subject: [PATCH 4/4] fix: isCuMemMapAllocated crashes on non-NVLS systems even with MSCCLPP_FORCE_DISABLE_NVLS=true (#790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [x] Fix `isCuMemMapAllocated()` to just return `true/false` without throwing when NVLS is not supported - [x] Fix `isNvlsSupported()` caching bug where `result`/`isChecked` were never updated - [x] Restore `[[maybe_unused]]` on `result` and `isChecked` statics — needed in HIP/ROCm env where `CUDA_NVLS_API_AVAILABLE` is not defined and the variables would otherwise be unused - [x] Run linter (`./tools/lint.sh`) --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com> --- src/core/gpu_utils.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 628d2dcb..09d5025d 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -283,7 +283,9 @@ bool isNvlsSupported() { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId)); MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - return isMulticastSupported == 1; + result = (isMulticastSupported == 1); + isChecked = true; + return result; } return result; #endif @@ -300,9 +302,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) { return false; } MSCCLPP_CUTHROW(cuMemRelease(handle)); - if (!isNvlsSupported()) { - throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage); - } return true; #endif }
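
For reference, the corrected caching in `isNvlsSupported()` reduces to the following pattern (a minimal sketch around the hunk above; the full function also handles `MSCCLPP_FORCE_DISABLE_NVLS` and other checks not shown here):

```cpp
bool isNvlsSupported() {
  // [[maybe_unused]] keeps HIP/ROCm builds warning-free: without
  // CUDA_NVLS_API_AVAILABLE the block below compiles away and the
  // statics would otherwise trigger unused-variable warnings.
  [[maybe_unused]] static bool result = false;
  [[maybe_unused]] static bool isChecked = false;
#if defined(CUDA_NVLS_API_AVAILABLE)
  if (!isChecked) {
    int deviceId = 0, isMulticastSupported = 0;
    CUdevice dev;
    MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
    MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
    MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
    result = (isMulticastSupported == 1);
    isChecked = true;  // previously never set, so the device query re-ran on every call
  }
#endif
  return result;
}
```

With the cache fixed, `isCuMemMapAllocated()` can simply report whether `cuMemRetainAllocationHandle` succeeded for the pointer, rather than throwing on systems without NVLS support.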