create CI pipeline for rocm (#718)

Create CI pipeline for AMD GPU.
This commit is contained in:
Binyang Li
2026-02-09 16:55:16 -08:00
committed by GitHub
parent d7925448f3
commit c12822a7af
12 changed files with 118 additions and 158 deletions

View File

@@ -1,114 +0,0 @@
# Azure DevOps pipeline: mscclpp integration tests on AMD GPUs (ROCm).
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped; structure follows the Azure Pipelines schema.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: IntegrationTestRocm
    displayName: Integration test ROCm
    strategy:
      matrix:
        rocm6.2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
    pool:
      name: mscclpp-rocm
    container:
      # Runtime expression: matrix variables are only resolved at runtime here.
      image: $[ variables['containerImage'] ]
      # GPU access for ROCm: video group membership plus unlimited locked memory.
      options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1
    steps:
      # Build mscclpp with hipcc; GPU check bypassed since the build agent may
      # differ from the test GPUs.
      - task: Bash@3
        name: Build
        displayName: Build
        inputs:
          targetType: 'inline'
          script: |
            mkdir build && cd build
            CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
            make -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      # Build rccl-tests, which provides the all_reduce_perf benchmark used below.
      - task: Bash@3
        name: InstallRcclTest
        displayName: Install rccl-test
        inputs:
          targetType: 'inline'
          script: |
            git clone https://github.com/ROCm/rccl-tests.git
            cd rccl-tests
            make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      - task: Bash@3
        name: InstallDep
        displayName: Install dependencies
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/Azure/msccl-tools.git
            cd msccl-tools
            pip3 install .
      # Generate MSCCL execution-plan JSON files from the private msccl-users
      # repo (authenticated via the GIT_USER/GIT_PAT pipeline secrets).
      # Fixed typo in step name: was "GenerateExectionFiles".
      - task: Bash@3
        name: GenerateExecutionFiles
        displayName: Generate execution files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users
            cd msccl-users
            mkdir execution-files
            python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
            python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json
      # Run the rccl-tests allreduce benchmark with mscclpp's NCCL-compatible
      # shim preloaded (LD_PRELOAD of libmscclpp_nccl.so).
      - task: Bash@3
        name: AllReduceTest
        displayName: Run mscclpp allReduce test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
            -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'
      # Same benchmark, but driven by the generated execution-plan JSON files.
      - task: Bash@3
        name: AllReduceWithExecutionFileTest
        displayName: Run mscclpp allReduce with execution file
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
            -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
            -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
            -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
            -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -5,6 +5,9 @@ parameters:
type: string
- name: sshKeySecureFile
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
@@ -16,7 +19,11 @@ steps:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
else
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
fi
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -52,7 +59,7 @@ steps:
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
arguments: "single-node-test true ${{ parameters.platform }}"
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -119,7 +126,7 @@ steps:
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -0,0 +1,50 @@
# Azure DevOps pipeline: mscclpp unit tests on an MI300X (ROCm) pool, driven
# by the shared templates/ut.yaml template with platform=rocm.
# NOTE(review): indentation below is reconstructed — the scraped source had all
# leading whitespace stripped; structure follows the Azure Pipelines schema.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: UnitTestMI300X
    timeoutInMinutes: 40
    pool:
      # NOTE(review): pool name "msccl-ci-mi300x" differs from the
      # "mscclpp-*" naming used elsewhere — confirm against agent pools.
      name: msccl-ci-mi300x
    strategy:
      matrix:
        rocm6_2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
    container:
      image: $(containerImage)
    steps:
      - template: templates/ut.yaml
        parameters:
          subscription: mscclpp-ci-mi300x
          vmssName: mscclpp-mi300x-ci
          sshKeySecureFile: mscclpp.pem
          platform: rocm
          gpuArch: gfx942

View File

@@ -9,7 +9,7 @@
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -24,6 +24,16 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Install ROCm-specific packages if building for ROCm
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Create Python venv
RUN python3 -m venv /root/venv && \
echo 'source /root/venv/bin/activate' >> /root/.bashrc
@@ -32,8 +42,10 @@ ENV PATH="/root/venv/bin:${PATH}"
# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda13.0"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
if echo "$TARGET" | grep -q "^rocm"; then \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt

View File

@@ -1,19 +0,0 @@
# Dockerfile (deleted by this commit): built RCCL from source on top of a
# caller-supplied base image, targeting a single AMD GPU architecture.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
# RCCL release branch to clone and build.
ENV RCCL_VERSION=rocm-6.2.0
# GPU target passed to rccl's install.sh (default: MI300-series gfx942).
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}
# Shallow-clone RCCL, install it into /opt/rocm, then remove the source tree
# to keep the image layer small.
RUN cd /tmp && \
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
cd rccl && \
./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
cd .. && \
rm -rf /tmp/rccl
WORKDIR /

View File

@@ -12,7 +12,7 @@ baseImageTable=(
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
@@ -29,6 +29,7 @@ ofedVersionTable=(
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
["cuda13.0"]="24.10-3.2.5.0"
["rocm6.2"]="24.10-1.1.4.0"
)
TARGET=${1}
@@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \
if [[ ${TARGET} == rocm* ]]; then
echo "Building ROCm base image..."
docker build -t ${TAG_BASE} \
-f docker/base-x-rocm.dockerfile \
--build-arg BASE_IMAGE=${TAG_TMP} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg GPU_ARCH="gfx942" .
docker rmi ${TAG_TMP}
else
echo "Building CUDA base image..."
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
fi
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
docker build -t ${TAG_BASE_DEV} \
-f docker/base-dev-x.dockerfile \

View File

@@ -6,4 +6,5 @@ pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
blake3
pybind11

View File

@@ -0,0 +1,10 @@
# Python test dependencies for the ROCm platform (new file in this commit).
# NOTE(review): cupy==13.6.0 is presumably built with HIP support via
# CUPY_INSTALL_USE_HIP (see docker build changes in this commit) — confirm.
mpi4py==4.1.1
cupy==13.6.0
prettytable
netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3
pybind11

View File

@@ -14,6 +14,9 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
if(MSCCLPP_USE_ROCM)
file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
foreach(arch ${MSCCLPP_GPU_ARCHS})
add_compile_options(--offload-arch=${arch})
endforeach()
endif()
function(add_test_executable name sources)

View File

@@ -1,8 +1,8 @@
set -e
# get parameter from $1 and $2
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"
PLATFORM="${3:-cuda}"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
@@ -35,20 +35,29 @@ set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi
# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
LAUNCH_OPTION="--gpus=all"
if [ "${PLATFORM}" == "rocm" ]; then
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
fi
if [ "${IB_ENVIRONMENT}" == "true" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
else
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
fi
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"

View File

@@ -1,5 +1,7 @@
set -e
PLATFORM="${1:-cuda}"
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys
@@ -8,10 +10,12 @@ chown root:root /root/.ssh/config
chmod 400 /root/mscclpp/sshkey
chown root:root /root/mscclpp/sshkey
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
if [ "${PLATFORM}" == "cuda" ]; then
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
fi
make -C /root/mscclpp/tools/peer-access-test
/root/mscclpp/tools/peer-access-test/peer_access_test
@@ -19,10 +23,13 @@ make -C /root/mscclpp/tools/peer-access-test clean
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
else
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
fi
if [ "${PLATFORM}" == "rocm" ]; then
export CXX=/opt/rocm/bin/hipcc
fi
cd /root/mscclpp && pip3 install .
pip3 install setuptools_scm
python3 -m setuptools_scm --force-write-version-files