From 56bdbc2f329623cf8811021114dcac3dee948904 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Mon, 10 Jul 2023 13:19:14 +0800 Subject: [PATCH] Enable test for both cuda11 and cuda12 (#124) Update pipeline: enable test for both cuda11 and cuda12 --- .azure-pipelines/integration-test.yml | 182 ++++++++++++------------ .azure-pipelines/multi-nodes-test.yml | 195 ++++++++++++++------------ .azure-pipelines/ut.yml | 9 +- .github/workflows/codeql.yml | 3 +- test/deploy/deploy.sh | 4 +- 5 files changed, 210 insertions(+), 183 deletions(-) diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 1b29c932..c7491569 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -7,96 +7,106 @@ pr: - main drafts: false -pool: - name: mscclpp -container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 +jobs: +- job: IntegrationTest + displayName: Integration test + strategy: + matrix: + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp - mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' + pool: + name: mscclpp + container: + image: $[ variables['containerImage'] ] + options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 -- task: Bash@3 - name: LockGPUClock - displayName: Lock GPU clock frequency - inputs: - targetType: 'inline' - script: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - workingDirectory: '$(System.DefaultWorkingDirectory)' + steps: + - task: Bash@3 + name: Build + displayName: Build + inputs: + targetType: 'inline' + script: | + curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz + tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp + mkdir build && cd build + MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: LockGPUClock + displayName: Lock GPU clock frequency + inputs: + targetType: 'inline' + script: | + sudo nvidia-smi -pm 1 + for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do + sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i + done + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: SendRecvTest - displayName: Run mscclpp SendRecv test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: AllGatherTest + displayName: Run mscclpp AllGather test + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp AllReduce test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: SendRecvTest + displayName: Run mscclpp SendRecv test + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: AllToAll - displayName: Run mscclpp AllToAll test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: AllReduceTest + displayName: Run mscclpp AllReduce test + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: CheckPerfNumber - displayName: Check collective primitives performance - inputs: - targetType: 'inline' - script: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: AllToAll + displayName: Run mscclpp AllToAll test + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + workingDirectory: '$(System.DefaultWorkingDirectory)' + + - task: Bash@3 + name: CheckPerfNumber + displayName: Check collective primitives performance + inputs: + targetType: 'inline' + script: | + set -e + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index f2997882..8aab0d1a 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -4,104 +4,113 @@ trigger: # Do not run multi-nodes-test for PR, we can trigger it manually pr: none -pool: - name: mscclpp-it -container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +jobs: +- job: MultiNodesTest + displayName: Multi nodes test + strategy: + matrix: + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + pool: + name: mscclpp-it + container: + image: $[ variables['containerImage'] ] -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp - mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' + steps: + - task: Bash@3 + name: Build + displayName: Build + inputs: + targetType: 'inline' + script: | + curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz + tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp + mkdir build && cd build + MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ssh.key + - task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: ssh.key -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + - task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: mscclpp - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclpp-it-vmss --resource-group msccl-dev + - task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: mscclpp + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name mscclpp-it-vmss --resource-group msccl-dev -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + workingDirectory: '$(System.DefaultWorkingDirectory)' -- task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclpp-it-000000 - tail -f output/mscclpp-it-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test' - kill $CHILD_PID + - task: Bash@3 + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclpp-it-000000 + tail -f output/mscclpp-it-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test' + kill $CHILD_PID -- task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclpp-it-000000 - tail -f output/mscclpp-it-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut' - kill $CHILD_PID + - task: Bash@3 + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclpp-it-000000 + tail -f output/mscclpp-it-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut' + kill $CHILD_PID -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: mscclpp - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev + - task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: mscclpp + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 76e31456..b31ad8ad 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -12,8 +12,15 @@ jobs: timeoutInMinutes: 30 pool: name: mscclpp + strategy: + matrix: + cuda11: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + image: $[ variables['containerImage'] ] options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 9f46b386..3cc55e24 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,7 +12,7 @@ jobs: name: Analyze runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }} permissions: actions: read @@ -23,6 +23,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] + cuda-version: [ 'cuda11.8', 'cuda12.1' ] steps: - name: Checkout repository diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index eb8e75db..51c86d7b 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -33,11 +33,11 @@ parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${MSCCLPP # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1" + "sudo docker pull ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \ -w /root -v ${DST_DIR}:/root/mscclpp --name=mscclpp-test \ - --entrypoint /bin/bash ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1" + --entrypoint /bin/bash ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/setup.sh'"