Add CI for rocm (#346)

This commit is contained in:
Binyang Li
2024-09-15 15:30:54 -07:00
committed by GitHub
parent 7bedb25054
commit 0c7311e83f
5 changed files with 134 additions and 2 deletions

View File

@@ -0,0 +1,97 @@
# Azure Pipelines definition: builds MSCCL++ with ROCm inside a container and
# runs the rccl-tests all_reduce_perf benchmark against the resulting
# libmscclpp_nccl.so on the `mscclpp-rocm` agent pool.

# Run on pushes to main.
trigger:
- main

# Run on non-draft pull requests that target main.
pr:
  branches:
    include:
    - main
  drafts: false

jobs:
- job: IntegrationTestRocm
  displayName: Integration test ROCm
  strategy:
    matrix:
      # One leg per ROCm image; `containerImage` is read by the
      # `container.image` runtime expression below.
      rocm6.2:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2

  pool:
    name: mscclpp-rocm
  container:
    image: $[ variables['containerImage'] ]
    options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1

  steps:
  # Configure and compile the project with hipcc into ./build.
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        set -e
        mkdir -p build && cd build
        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'

  # Build the rccl-tests benchmark binaries with MPI support.
  - task: Bash@3
    name: InstallRcclTest
    displayName: Install rccl-test
    inputs:
      targetType: 'inline'
      script: |
        set -e
        git clone https://github.com/ROCm/rccl-tests.git
        cd rccl-tests
        make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'

  # Install msccl-tools from source with pip.
  - task: Bash@3
    name: InstallDep
    displayName: Install dependencies
    inputs:
      targetType: 'inline'
      script: |
        set -e
        git clone https://github.com/Azure/msccl-tools.git
        cd msccl-tools
        pip3 install .

  # Generate the two all-reduce execution plans (JSON) consumed by the last
  # step. The step name keeps its original spelling ("Exection") because it is
  # an identifier that other pipeline expressions could reference.
  # NOTE(review): the PAT is embedded in the clone URL, so it is also written
  # to the clone's .git/config; consider passing it via an HTTP auth header.
  - task: Bash@3
    name: GenerateExectionFiles
    displayName: Generate execution files
    inputs:
      targetType: 'inline'
      script: |
        set -e
        git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/azure-mscclpp
        cd azure-mscclpp
        git checkout binyli/ci
        mkdir execution-files
        python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
        python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json

  # Run all_reduce_perf on 8 ranks with libmscclpp_nccl.so preloaded.
  - task: Bash@3
    name: AllReduceTest
    displayName: Run mscclpp allReduce test
    inputs:
      targetType: 'inline'
      script: |
        set -e
        export PATH=/usr/local/mpi/bin:$PATH
        sudo /usr/local/mpi/bin/mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
          -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
      workingDirectory: '$(System.DefaultWorkingDirectory)'

  # Same benchmark, additionally pointing the library at the generated
  # execution-plan JSON files.
  - task: Bash@3
    name: AllReduceWithExecutionFileTest
    displayName: Run mscclpp allReduce with execution file
    inputs:
      targetType: 'inline'
      script: |
        set -e
        export PATH=/usr/local/mpi/bin:$PATH
        sudo /usr/local/mpi/bin/mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" -x NCCL_DEBUG=WARN \
          -x ALLREDUCEPKT_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_packet.json \
          -x ALLREDUCE_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_sm_mscclpp.json \
          -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
          -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
      workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -0,0 +1,19 @@
# Adds RCCL, built from source for a single AMD GPU architecture, on top of
# the image supplied through the BASE_IMAGE build argument.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
# Use the key="value" form; the whitespace-separated LABEL form is legacy.
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

ENV DEBIAN_FRONTEND=noninteractive

# RCCL branch or tag to check out.
ENV RCCL_VERSION=rocm-6.2.0
# GPU architecture handed to RCCL's install.sh; override with --build-arg ARCH.
ARG ARCH=gfx942
ENV ARCH_TARGET=${ARCH}

# Shallow-clone RCCL, install it under /opt/rocm, and delete the checkout in
# the same layer so the sources do not stay in the image.
RUN cd /tmp && \
    git clone --branch "${RCCL_VERSION}" --depth 1 https://github.com/ROCm/rccl.git && \
    cd rccl && \
    ./install.sh --prefix=/opt/rocm --amdgpu_targets "${ARCH_TARGET}" && \
    cd .. && \
    rm -rf /tmp/rccl

WORKDIR /

View File

@@ -5,6 +5,7 @@ LABEL maintainer="MSCCL++"
# Use the key="value" form; the whitespace-separated LABEL form is legacy.
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

ENV DEBIAN_FRONTEND=noninteractive
# Run the instructions that follow as root.
USER root

# Delete /opt/nvidia from the image.
RUN rm -rf /opt/nvidia

View File

@@ -8,6 +8,7 @@ baseImageTable=(
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["rocm6.2"]="rocm/rocm-terminal:6.2"
)
declare -A extraLdPathTable
@@ -16,13 +17,14 @@ extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["rocm6.2"]="/opt/rocm/lib"
)
# Image repository prefix shared by the tags this script builds.
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
# Build target key (e.g. "rocm6.2"), taken from the first command-line argument.
TARGET=${1}
# Print a single usage line listing the accepted target names.
# The stale pre-ROCm usage line is dropped so the message is emitted once.
print_usage() {
    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
@@ -36,12 +38,25 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Build from the parent of the script directory so the docker/ paths resolve.
cd "${SCRIPT_DIR}/.."

# Build the common base image for the target. It is tagged under the
# "-common" repository. The leftover `docker build -t ${GHCR}:base-...` line
# is removed: its trailing backslash joined it onto this command.
docker build -t "${GHCR}-common:base-${TARGET}" \
    -f docker/base-x.dockerfile \
    --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
    --build-arg TARGET="${TARGET}" .
# Produce ${GHCR}:base-${TARGET}. ROCm targets get an extra image built from
# base-x-rocm.dockerfile on top of the common image; every other target simply
# re-tags the common image. Expansions are quoted to avoid word splitting.
if [[ ${TARGET} == rocm* ]]; then
    echo "Building ROCm base image..."
    docker build -t "${GHCR}:base-${TARGET}" \
        -f docker/base-x-rocm.dockerfile \
        --build-arg BASE_IMAGE="${GHCR}-common:base-${TARGET}" \
        --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
        --build-arg TARGET="${TARGET}" \
        --build-arg ARCH="gfx942" .
else
    echo "Building CUDA base image..."
    docker tag "${GHCR}-common:base-${TARGET}" "${GHCR}:base-${TARGET}"
fi
docker build -t ${GHCR}:base-dev-${TARGET} \
-f docker/base-dev-x.dockerfile \
--build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \

View File