mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
Add CI for rocm (#346)
This commit is contained in:
97
.azure-pipelines/integration-test-rocm.yml
Normal file
97
.azure-pipelines/integration-test-rocm.yml
Normal file
@@ -0,0 +1,97 @@
|
||||
# Azure Pipelines definition: ROCm integration test.
# Builds the project with hipcc, builds rccl-tests, generates execution
# files, then runs all_reduce_perf under mpirun with libmscclpp_nccl.so
# preloaded.

# Run on pushes to main.
trigger:
  - main

# Run on non-draft pull requests that target main.
pr:
  branches:
    include:
      - main
  drafts: false

jobs:
  - job: IntegrationTestRocm
    displayName: Integration test ROCm
    strategy:
      matrix:
        # Each matrix entry supplies the container image used by the job.
        rocm6.2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2

    pool:
      name: mscclpp-rocm
    container:
      # Runtime expression: resolved per matrix entry from containerImage.
      image: $[ variables['containerImage'] ]
      options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1

    steps:
      # Configure and compile the project with the ROCm toolchain.
      - task: Bash@3
        name: Build
        displayName: Build
        inputs:
          targetType: 'inline'
          # `set -e` stops at the first failing command. mkdir and cd are on
          # separate lines because `set -e` ignores a failure in the
          # non-final command of an `&&` list.
          script: |
            set -e
            mkdir build
            cd build
            CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON ..
            make -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Clone and build rccl-tests; later steps run ./rccl-tests/build/all_reduce_perf.
      - task: Bash@3
        name: InstallRcclTest
        displayName: Install rccl-test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/ROCm/rccl-tests.git
            cd rccl-tests
            make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Install msccl-tools from source with pip.
      - task: Bash@3
        name: InstallDep
        displayName: Install dependencies
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/Azure/msccl-tools.git
            cd msccl-tools
            pip3 install .

      # Generate the JSON execution files consumed by the execution-file test.
      # NOTE(review): the clone URL embeds $(GIT_USER):$(GIT_PAT); confirm both
      # are defined as secret pipeline variables.
      - task: Bash@3
        name: GenerateExectionFiles
        displayName: Generate execution files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/azure-mscclpp
            cd azure-mscclpp
            git checkout binyli/ci
            mkdir execution-files
            python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
            python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json

      # Run all_reduce_perf on 8 ranks with libmscclpp_nccl.so preloaded.
      - task: Bash@3
        name: AllReduceTest
        displayName: Run mscclpp allReduce test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo /usr/local/mpi/bin/mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
              -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Same benchmark, with the generated execution files passed through the
      # ALLREDUCEPKT_IP_JSON_FILE / ALLREDUCE_IP_JSON_FILE environment variables.
      - task: Bash@3
        name: AllReduceWithExecutionFileTest
        displayName: Run mscclpp allReduce with execution file
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo /usr/local/mpi/bin/mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" -x NCCL_DEBUG=WARN \
              -x ALLREDUCEPKT_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_packet.json \
              -x ALLREDUCE_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_sm_mscclpp.json \
              -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
              -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
19
docker/base-x-rocm.dockerfile
Normal file
19
docker/base-x-rocm.dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
# Builds RCCL from source and installs it into /opt/rocm on top of the
# image passed in through the BASE_IMAGE build argument.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
# key="value" form, consistent with the maintainer label above.
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

ENV DEBIAN_FRONTEND=noninteractive

# Branch or tag of the RCCL repository to check out.
ENV RCCL_VERSION=rocm-6.2.0
# GPU target handed to RCCL's install.sh; override with --build-arg ARCH=...
ARG ARCH=gfx942
ENV ARCH_TARGET=${ARCH}
# Shallow-clone RCCL, install it under /opt/rocm, and delete the checkout
# in the same RUN instruction.
RUN cd /tmp && \
    git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
    cd rccl && \
    ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
    cd .. && \
    rm -rf /tmp/rccl

WORKDIR /
|
||||
@@ -5,6 +5,7 @@ LABEL maintainer="MSCCL++"
|
||||
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
USER root
|
||||
|
||||
RUN rm -rf /opt/nvidia
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ baseImageTable=(
|
||||
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
|
||||
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
|
||||
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
|
||||
["rocm6.2"]="rocm/rocm-terminal:6.2"
|
||||
)
|
||||
|
||||
declare -A extraLdPathTable
|
||||
@@ -16,13 +17,14 @@ extraLdPathTable=(
|
||||
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
|
||||
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
|
||||
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
|
||||
["rocm6.2"]="/opt/rocm/lib"
|
||||
)
|
||||
|
||||
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
|
||||
TARGET=${1}
|
||||
|
||||
# Print the TARGET values this script accepts as its first argument.
# The list mirrors the keys of baseImageTable, which includes rocm6.2.
print_usage() {
    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|rocm6.2]"
}
|
||||
|
||||
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
|
||||
@@ -36,12 +38,25 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
cd ${SCRIPT_DIR}/..
|
||||
|
||||
# Stage 1: build the common base image for every target from base-x.dockerfile.
docker build -t "${GHCR}-common:base-${TARGET}" \
    -f docker/base-x.dockerfile \
    --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
    --build-arg TARGET="${TARGET}" .

# Stage 2: produce ${GHCR}:base-${TARGET}.
# ROCm targets get an extra layer from base-x-rocm.dockerfile on top of the
# common image; all other targets just re-tag the common image.
if [[ ${TARGET} == rocm* ]]; then
    echo "Building ROCm base image..."
    docker build -t "${GHCR}:base-${TARGET}" \
        -f docker/base-x-rocm.dockerfile \
        --build-arg BASE_IMAGE="${GHCR}-common:base-${TARGET}" \
        --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
        --build-arg TARGET="${TARGET}" \
        --build-arg ARCH="gfx942" .
else
    echo "Building CUDA base image..."
    docker tag "${GHCR}-common:base-${TARGET}" "${GHCR}:base-${TARGET}"
fi
|
||||
|
||||
docker build -t ${GHCR}:base-dev-${TARGET} \
|
||||
-f docker/base-dev-x.dockerfile \
|
||||
--build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
|
||||
|
||||
0
python/requirements_rocm6.txt
Normal file
0
python/requirements_rocm6.txt
Normal file
Reference in New Issue
Block a user