mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 14:59:29 +00:00
@@ -1,114 +0,0 @@
|
||||
# Azure Pipelines definition for the ROCm integration test.
#
# The single job runs inside the ROCm base-dev container, builds the project
# with hipcc, builds rccl-tests, generates two allreduce execution files from
# the msccl-users repository, and then runs all_reduce_perf on 8 ranks with
# libmscclpp_nccl.so preloaded: once on its own and once with the generated
# execution files passed in through environment variables.

# CI trigger: pushes to main and release/* branches, ignoring changes that
# only touch the excluded paths.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

# PR trigger: same branch and path filters as above; draft PRs are skipped.
pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: IntegrationTestRocm
    displayName: Integration test ROCm
    strategy:
      matrix:
        # Matrix configuration names may contain only letters, digits, and
        # underscores, so the ROCm version is spelled with an underscore.
        rocm6_2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2

    pool:
      name: mscclpp-rocm
    container:
      # The image comes from the matrix variable via a runtime expression.
      image: $[ variables['containerImage'] ]
      options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1

    steps:
      # Configure and build the project for ROCm in ./build.
      # `set -e` makes the step stop at the first failing command, matching
      # the other script steps in this pipeline.
      - task: Bash@3
        name: Build
        displayName: Build
        inputs:
          targetType: 'inline'
          script: |
            set -e
            mkdir build && cd build
            CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
            make -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Clone and build rccl-tests with MPI support; it provides the
      # all_reduce_perf binary used by the test steps below.
      - task: Bash@3
        name: InstallRcclTest
        displayName: Install rccl-test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/ROCm/rccl-tests.git
            cd rccl-tests
            make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Install msccl-tools from source with pip.
      - task: Bash@3
        name: InstallDep
        displayName: Install dependencies
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://github.com/Azure/msccl-tools.git
            cd msccl-tools
            pip3 install .

      # Clone msccl-users (credentials come from the GIT_USER / GIT_PAT
      # pipeline variables) and write the two allreduce execution files that
      # the last step consumes.
      - task: Bash@3
        name: GenerateExectionFiles
        displayName: Generate execution files
        inputs:
          targetType: 'inline'
          script: |
            set -e
            git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users
            cd msccl-users
            mkdir execution-files
            python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
            python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json

      # Run all_reduce_perf on 8 ranks with libmscclpp_nccl.so preloaded.
      - task: Bash@3
        name: AllReduceTest
        displayName: Run mscclpp allReduce test
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
              -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Same benchmark, with the generated execution files supplied through
      # ALLREDUCEPKT_IP_JSON_FILE and ALLREDUCE_IP_JSON_FILE.
      - task: Bash@3
        name: AllReduceWithExecutionFileTest
        displayName: Run mscclpp allReduce with execution file
        inputs:
          targetType: 'inline'
          script: |
            set -e
            export PATH=/usr/local/mpi/bin:$PATH
            sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
              -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
              -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
              -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
              -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
          workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
@@ -5,6 +5,9 @@ parameters:
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
@@ -16,7 +19,11 @@ steps:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
if [ "${{ parameters.platform }}" == "rocm" ]; then
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
else
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
fi
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -52,7 +59,7 @@ steps:
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test"
|
||||
arguments: "single-node-test true ${{ parameters.platform }}"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
@@ -119,7 +126,7 @@ steps:
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
50
.azure-pipelines/ut-rocm.yml
Normal file
50
.azure-pipelines/ut-rocm.yml
Normal file
@@ -0,0 +1,50 @@
|
||||
# Azure Pipelines definition for the ROCm unit tests.
# The job's steps come entirely from the shared templates/ut.yaml template,
# which is given the ROCm platform and the gfx942 GPU architecture.

# Run on pushes to main and release/* unless only excluded paths changed.
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

# Run on non-draft PRs targeting main and release/*, with the same path filter.
pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
  - job: UnitTestMI300X
    timeoutInMinutes: 40
    pool:
      name: msccl-ci-mi300x
    strategy:
      matrix:
        rocm6_2:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2

    # The container image is taken from the matrix variable above.
    container:
      image: $(containerImage)

    steps:
      - template: templates/ut.yaml
        parameters:
          subscription: mscclpp-ci-mi300x
          vmssName: mscclpp-mi300x-ci
          sshKeySecureFile: mscclpp.pem
          platform: rocm
          gpuArch: gfx942
|
||||
@@ -9,7 +9,7 @@
|
||||
|--------------------------|-------------------|
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
| Integration Tests (ROCm) | [](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |
|
||||
| Unit Tests (ROCm) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
|
||||
|
||||
A GPU-driven communication stack for scalable AI applications.
|
||||
|
||||
|
||||
@@ -24,6 +24,16 @@ RUN OS_ARCH=$(uname -m) && \
|
||||
rm -rf ${CMAKE_HOME}.tar.gz && \
|
||||
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
|
||||
|
||||
# Install ROCm-specific packages if building for ROCm
|
||||
ARG TARGET="cuda13.0"
|
||||
RUN if echo "$TARGET" | grep -q "^rocm"; then \
|
||||
apt-get update -y && \
|
||||
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*; \
|
||||
fi
|
||||
|
||||
# Create Python venv
|
||||
RUN python3 -m venv /root/venv && \
|
||||
echo 'source /root/venv/bin/activate' >> /root/.bashrc
|
||||
@@ -32,8 +42,10 @@ ENV PATH="/root/venv/bin:${PATH}"
|
||||
# Install Python dependencies
|
||||
ADD . /tmp/mscclpp
|
||||
WORKDIR /tmp/mscclpp
|
||||
ARG TARGET="cuda13.0"
|
||||
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
|
||||
if echo "$TARGET" | grep -q "^rocm"; then \
|
||||
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
|
||||
fi && \
|
||||
pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r python/requirements_${target_type}.txt
|
||||
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
# Adds RCCL, built from source, on top of the image passed in as BASE_IMAGE.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++" \
      org.opencontainers.image.source=https://github.com/microsoft/mscclpp

ENV DEBIAN_FRONTEND=noninteractive

# RCCL git branch/tag to check out, and the GPU architecture handed to the
# RCCL install script (overridable with --build-arg GPU_ARCH=...).
ENV RCCL_VERSION=rocm-6.2.0
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}

# Shallow-clone the pinned RCCL revision into /tmp/rccl, run its install script
# with /opt/rocm as the prefix, and delete the checkout in the same layer.
RUN git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git /tmp/rccl && \
    cd /tmp/rccl && \
    ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
    cd /tmp && \
    rm -rf /tmp/rccl

WORKDIR /
|
||||
@@ -12,7 +12,7 @@ baseImageTable=(
|
||||
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
|
||||
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
|
||||
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
|
||||
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
|
||||
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
|
||||
)
|
||||
|
||||
declare -A extraLdPathTable
|
||||
@@ -29,6 +29,7 @@ ofedVersionTable=(
|
||||
["cuda12.8"]="24.10-1.1.4.0"
|
||||
["cuda12.9"]="24.10-1.1.4.0"
|
||||
["cuda13.0"]="24.10-3.2.5.0"
|
||||
["rocm6.2"]="24.10-1.1.4.0"
|
||||
)
|
||||
|
||||
TARGET=${1}
|
||||
@@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \
|
||||
|
||||
if [[ ${TARGET} == rocm* ]]; then
|
||||
echo "Building ROCm base image..."
|
||||
docker build -t ${TAG_BASE} \
|
||||
-f docker/base-x-rocm.dockerfile \
|
||||
--build-arg BASE_IMAGE=${TAG_TMP} \
|
||||
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
|
||||
--build-arg TARGET=${TARGET} \
|
||||
--build-arg GPU_ARCH="gfx942" .
|
||||
docker rmi ${TAG_TMP}
|
||||
else
|
||||
echo "Building CUDA base image..."
|
||||
docker tag ${TAG_TMP} ${TAG_BASE}
|
||||
docker rmi --no-prune ${TAG_TMP}
|
||||
fi
|
||||
docker tag ${TAG_TMP} ${TAG_BASE}
|
||||
docker rmi --no-prune ${TAG_TMP}
|
||||
|
||||
docker build -t ${TAG_BASE_DEV} \
|
||||
-f docker/base-dev-x.dockerfile \
|
||||
|
||||
@@ -6,4 +6,5 @@ pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
blake3
|
||||
blake3
|
||||
pybind11
|
||||
@@ -0,0 +1,10 @@
|
||||
# Python requirements list.

# Packages pinned to an exact version.
mpi4py==4.1.1
cupy==13.6.0

# Packages without a version pin.
prettytable
netifaces
pytest
numpy
matplotlib

# sortedcontainers is installed from a specific upstream git commit.
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed

blake3
pybind11
|
||||
@@ -14,6 +14,9 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
|
||||
if(MSCCLPP_USE_ROCM)
|
||||
file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
|
||||
set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
|
||||
foreach(arch ${MSCCLPP_GPU_ARCHS})
|
||||
add_compile_options(--offload-arch=${arch})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
function(add_test_executable name sources)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
set -e
|
||||
|
||||
# get parameters from $1, $2, and $3
|
||||
TEST_NAME=$1
|
||||
IB_ENVIRONMENT="${2:-true}"
|
||||
PLATFORM="${3:-cuda}"
|
||||
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
|
||||
@@ -35,20 +35,29 @@ set -e
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
|
||||
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
|
||||
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
|
||||
fi
|
||||
|
||||
# force a pull of the latest image
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker pull ${CONTAINERIMAGE}"
|
||||
|
||||
LAUNCH_OPTION="--gpus=all"
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
|
||||
fi
|
||||
if [ "${IB_ENVIRONMENT}" == "true" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
|
||||
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
else
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
|
||||
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
fi
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
|
||||
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
set -e
|
||||
|
||||
PLATFORM="${1:-cuda}"
|
||||
|
||||
mkdir -p /root/.ssh
|
||||
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
|
||||
chown root:root /root/.ssh/authorized_keys
|
||||
@@ -8,10 +10,12 @@ chown root:root /root/.ssh/config
|
||||
chmod 400 /root/mscclpp/sshkey
|
||||
chown root:root /root/mscclpp/sshkey
|
||||
|
||||
nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
||||
done
|
||||
if [ "${PLATFORM}" == "cuda" ]; then
|
||||
nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
||||
done
|
||||
fi
|
||||
|
||||
make -C /root/mscclpp/tools/peer-access-test
|
||||
/root/mscclpp/tools/peer-access-test/peer_access_test
|
||||
@@ -19,10 +23,13 @@ make -C /root/mscclpp/tools/peer-access-test clean
|
||||
|
||||
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
|
||||
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
|
||||
else
|
||||
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
|
||||
pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
|
||||
fi
|
||||
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
export CXX=/opt/rocm/bin/hipcc
|
||||
fi
|
||||
cd /root/mscclpp && pip3 install .
|
||||
pip3 install setuptools_scm
|
||||
python3 -m setuptools_scm --force-write-version-files
|
||||
|
||||
Reference in New Issue
Block a user