create CI pipeline for rocm (#718)

Create CI pipeline for AMD GPU.
This commit is contained in:
Binyang Li
2026-02-09 16:55:16 -08:00
committed by GitHub
parent d7925448f3
commit c12822a7af
12 changed files with 118 additions and 158 deletions

View File

@@ -1,114 +0,0 @@
# Azure DevOps pipeline: mscclpp integration tests on AMD GPUs (ROCm).
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped; structure follows the Azure Pipelines schema.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: IntegrationTestRocm
    displayName: Integration test ROCm
    strategy:
      matrix:
        rocm6.2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
    pool:
      name: mscclpp-rocm
    container:
      # Runtime expression: matrix variables are only resolved at runtime here.
      image: $[ variables['containerImage'] ]
      # GPU access for ROCm: video group membership plus unlimited locked memory.
      options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1
    steps:
      # Build mscclpp with hipcc; GPU check bypassed since the build agent may
      # differ from the test GPUs.
      - task: Bash@3
        name: Build
        displayName: Build
        inputs:
          targetType: 'inline'
          script: |
            mkdir build && cd build
            CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
            make -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      # Build rccl-tests, which provides the all_reduce_perf benchmark used below.
      - task: Bash@3
        name: InstallRcclTest
        displayName: Install rccl-test
        inputs:
          targetType: 'inline'
          script: |
            git clone https://github.com/ROCm/rccl-tests.git
            cd rccl-tests
            make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      - task: Bash@3
        name: InstallDep
        displayName: Install dependencies
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/Azure/msccl-tools.git
            cd msccl-tools
            pip3 install .
      # Generate MSCCL execution-plan JSON files from the private msccl-users
      # repo (authenticated via the GIT_USER/GIT_PAT pipeline secrets).
      # Fixed typo in step name: was "GenerateExectionFiles".
      - task: Bash@3
        name: GenerateExecutionFiles
        displayName: Generate execution files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users
            cd msccl-users
            mkdir execution-files
            python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
            python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json
      # Run the rccl-tests allreduce benchmark with mscclpp's NCCL-compatible
      # shim preloaded (LD_PRELOAD of libmscclpp_nccl.so).
      - task: Bash@3
        name: AllReduceTest
        displayName: Run mscclpp allReduce test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
            -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      # Same benchmark, but driven by the generated execution-plan JSON files.
      - task: Bash@3
        name: AllReduceWithExecutionFileTest
        displayName: Run mscclpp allReduce with execution file
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
            -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
            -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
            -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
            -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -5,6 +5,9 @@ parameters:
type: string
- name: sshKeySecureFile
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
@@ -16,7 +19,11 @@ steps:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
else
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
fi
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -52,7 +59,7 @@ steps:
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
arguments: "single-node-test true ${{ parameters.platform }}"
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -119,7 +126,7 @@ steps:
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -0,0 +1,50 @@
# Azure DevOps pipeline: mscclpp unit tests on an MI300X (ROCm) pool, driven
# by the shared templates/ut.yaml template with platform=rocm.
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped; structure follows the Azure Pipelines schema.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: UnitTestMI300X
    timeoutInMinutes: 40
    pool:
      # NOTE(review): pool name "msccl-ci-mi300x" differs from the
      # "mscclpp-*" naming used elsewhere — confirm against agent pools.
      name: msccl-ci-mi300x
    strategy:
      matrix:
        rocm6_2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
    container:
      image: $(containerImage)
    steps:
      - template: templates/ut.yaml
        parameters:
          subscription: mscclpp-ci-mi300x
          vmssName: mscclpp-mi300x-ci
          sshKeySecureFile: mscclpp.pem
          platform: rocm
          gpuArch: gfx942

View File

@@ -9,7 +9,7 @@
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -24,6 +24,16 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Install ROCm-specific packages if building for ROCm
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Create Python venv
RUN python3 -m venv /root/venv && \
echo 'source /root/venv/bin/activate' >> /root/.bashrc
@@ -32,8 +42,10 @@ ENV PATH="/root/venv/bin:${PATH}"
# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda13.0"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
if echo "$TARGET" | grep -q "^rocm"; then \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt

View File

@@ -1,19 +0,0 @@
# Dockerfile (deleted by this commit): built RCCL from source on top of a
# caller-supplied base image, targeting a single AMD GPU architecture.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
# RCCL release branch to clone and build.
ENV RCCL_VERSION=rocm-6.2.0
# GPU target passed to rccl's install.sh (default: MI300-series gfx942).
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}
# Shallow-clone RCCL, install it into /opt/rocm, then remove the source tree
# to keep the image layer small.
RUN cd /tmp && \
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
cd rccl && \
./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
cd .. && \
rm -rf /tmp/rccl
WORKDIR /

View File

@@ -12,7 +12,7 @@ baseImageTable=(
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
@@ -29,6 +29,7 @@ ofedVersionTable=(
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
["cuda13.0"]="24.10-3.2.5.0"
["rocm6.2"]="24.10-1.1.4.0"
)
TARGET=${1}
@@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \
if [[ ${TARGET} == rocm* ]]; then
echo "Building ROCm base image..."
docker build -t ${TAG_BASE} \
-f docker/base-x-rocm.dockerfile \
--build-arg BASE_IMAGE=${TAG_TMP} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg GPU_ARCH="gfx942" .
docker rmi ${TAG_TMP}
else
echo "Building CUDA base image..."
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
fi
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
docker build -t ${TAG_BASE_DEV} \
-f docker/base-dev-x.dockerfile \

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -0,0 +1,10 @@
# Python test dependencies for the ROCm platform (new file in this commit).
# NOTE(review): cupy==13.6.0 is presumably built with HIP support via
# CUPY_INSTALL_USE_HIP (see docker build changes in this commit) — confirm.
mpi4py==4.1.1
cupy==13.6.0
prettytable
netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
pybind11

View File

@@ -14,6 +14,9 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
if(MSCCLPP_USE_ROCM)
file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
foreach(arch ${MSCCLPP_GPU_ARCHS})
add_compile_options(--offload-arch=${arch})
endforeach()
endif()
function(add_test_executable name sources)

View File

@@ -1,8 +1,8 @@
set -e
# get parameter from $1 and $2
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"
PLATFORM="${3:-cuda}"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
@@ -35,20 +35,29 @@ set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi
# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
LAUNCH_OPTION="--gpus=all"
if [ "${PLATFORM}" == "rocm" ]; then
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
fi
if [ "${IB_ENVIRONMENT}" == "true" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
else
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
fi
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"

View File

@@ -1,5 +1,7 @@
set -e
PLATFORM="${1:-cuda}"
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys
@@ -8,10 +10,12 @@ chown root:root /root/.ssh/config
chmod 400 /root/mscclpp/sshkey
chown root:root /root/mscclpp/sshkey
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
if [ "${PLATFORM}" == "cuda" ]; then
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
fi
make -C /root/mscclpp/tools/peer-access-test
/root/mscclpp/tools/peer-access-test/peer_access_test
@@ -19,10 +23,13 @@ make -C /root/mscclpp/tools/peer-access-test clean
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
else
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
fi
if [ "${PLATFORM}" == "rocm" ]; then
export CXX=/opt/rocm/bin/hipcc
fi
cd /root/mscclpp && pip3 install .
pip3 install setuptools_scm
python3 -m setuptools_scm --force-write-version-files