diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml index c61854bb..ec2122c6 100644 --- a/.azure-pipelines/integration-test-rocm.yml +++ b/.azure-pipelines/integration-test-rocm.yml @@ -78,7 +78,7 @@ jobs: script: | set -e export PATH=/usr/local/mpi/bin:$PATH - sudo /usr/local/mpi/bin/mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \ + sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \ -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -90,7 +90,7 @@ jobs: script: | set -e export PATH=/usr/local/mpi/bin:$PATH - sudo /usr/local/mpi/bin/mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ + sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \ -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \ -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \ diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index a758029b..97403c89 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -45,7 +45,7 @@ jobs: sudo apt-get update -y sudo apt-get install pssh -y curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - + - task: DownloadSecureFile@1 name: SshKeyFile displayName: Download key file diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 06aeee40..0d0c39d0 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -28,7 +28,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.2' ] + version: [ 'cuda11.8', 'cuda12.8' ] steps: - name: Checkout repository diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 287b5369..1db3105a 100644 --- a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.2' ] + version: [ 'cuda11.8', 'cuda12.8' ] steps: - uses: actions/checkout@v4 diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 26216711..5125dba6 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -10,7 +10,7 @@ RUN apt-get update && \ lcov \ vim \ && \ - apt-get autoremove && \ + apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* diff --git a/docker/base-x.dockerfile b/docker/base-x.dockerfile index 45f39c70..468362de 100644 --- a/docker/base-x.dockerfile +++ b/docker/base-x.dockerfile @@ -25,41 +25,34 @@ RUN apt-get update && \ python3-setuptools \ python3-wheel \ sudo \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* + wget # Install OFED ARG OFED_VERSION=5.2-2.2.3.0 RUN cd /tmp && \ + ARCH=$(uname -m) && \ OS_VERSION=$(lsb_release -rs) && \ OS_VERSION=ubuntu${OS_VERSION} && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* -# Install OpenMPI -ENV OPENMPI_VERSION=4.1.5 -RUN cd /tmp && \ - export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \ - wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \ - tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \ - cd openmpi-${OPENMPI_VERSION} && \ - ./configure --prefix=/usr/local/mpi && \ - make -j && \ - make install && \ - cd .. && \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* +# Install OpenMPI (should be done after the OFED installation) & clean apt cache +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libopenmpi-dev \ + && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* -ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64 -ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" +# OpenMPI short link (for compatibility with old images) +RUN ln -s /usr/lib/x86_64-linux-gnu/openmpi /usr/local/mpi -RUN echo PATH="${PATH}" > /etc/environment && \ - echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment +ARG EXTRA_LD_PATH= +ENV LD_LIBRARY_PATH="${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" +RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment ENTRYPOINT [] WORKDIR / diff --git a/docker/build.sh b/docker/build.sh index ff9fd581..bcc37eb0 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -9,6 +9,7 @@ baseImageTable=( ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" + ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" ["rocm6.2"]="rocm/rocm-terminal:6.2.1" ) @@ -23,13 +24,14 @@ extraLdPathTable=( declare -A ofedVersionTable ofedVersionTable=( ["cuda12.4"]="23.07-0.5.1.2" + ["cuda12.8"]="24.10-1.1.4.0" ) GHCR="ghcr.io/microsoft/mscclpp/mscclpp" TARGET=${1} print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|rocm6.2]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then @@ -64,9 +66,11 @@ if [[ ${TARGET} == rocm* ]]; then --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ --build-arg TARGET=${TARGET} \ --build-arg ARCH="gfx942" . + docker rmi ${GHCR}-common:base-${TARGET} else echo "Building CUDA base image..." docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET} + docker rmi --no-prune ${GHCR}-common:base-${TARGET} fi docker build -t ${GHCR}:base-dev-${TARGET} \ diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index a5def0f7..85e082dd 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,45 +1,46 @@ set -e HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi +export PATH=/usr/local/mpi/bin:$PATH function run_mscclpp_test() { echo "=================Run allgather_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl # For kernel 2, the message size must can be divided by 3 - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl echo "==================Run allreduce_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl echo "==================Run alltoall_test_perf on 2 nodes=========================" - /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ + mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \ -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl @@ -51,12 +52,12 @@ function run_mscclpp_test() function run_mp_ut() { echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)=========================" - /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \ + mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \ -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)=========================" - /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ + mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003 } @@ -64,7 +65,7 @@ function run_mp_ut() function run_pytests() { echo "==================Run python tests================================" - /usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ + mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \ -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh } @@ -72,7 +73,7 @@ function run_pytests() function run_py_benchmark() { echo "==================Run python benchmark================================" - /usr/local/mpi/bin/mpirun -allow-run-as-root -np 16 --bind-to numa \ + mpirun -allow-run-as-root -np 16 --bind-to numa \ -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \ -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \