From 6ee4e803178b628c7b80683d68ceac1712ceaa8b Mon Sep 17 00:00:00 2001
From: Binyang2014
Date: Tue, 13 Jun 2023 14:34:07 +0800
Subject: [PATCH] Create Azure pipeline for multi-node tests (#97)

Create Azure pipeline to run mscclpp-test on multi-nodes
---
 .azure-pipelines/multi-nodes-test.yml | 88 +++++++++++++++++++++++++++
 test/mscclpp-test/deploy/config       |  8 +++
 test/mscclpp-test/deploy/deploy.sh    | 46 ++++++++++++++
 test/mscclpp-test/deploy/hostfile     |  2 +
 test/mscclpp-test/deploy/hostfile_mpi |  2 +
 test/mscclpp-test/deploy/run_tests.sh | 19 ++++++
 test/mscclpp-test/deploy/setup.sh     | 19 ++++++
 7 files changed, 184 insertions(+)
 create mode 100644 .azure-pipelines/multi-nodes-test.yml
 create mode 100644 test/mscclpp-test/deploy/config
 create mode 100644 test/mscclpp-test/deploy/deploy.sh
 create mode 100644 test/mscclpp-test/deploy/hostfile
 create mode 100644 test/mscclpp-test/deploy/hostfile_mpi
 create mode 100644 test/mscclpp-test/deploy/run_tests.sh
 create mode 100644 test/mscclpp-test/deploy/setup.sh

diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
new file mode 100644
index 00000000..b388c9d7
--- /dev/null
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -0,0 +1,88 @@
+# Multi-node test pipeline: build mscclpp, start the test VMSS, deploy the build
+# to every node, run mscclpp-test from the first node, then deallocate the VMSS.
+trigger:
+- main
+
+# Do not run multi-nodes-test for PR, we can trigger it manually
+pr: none
+
+pool:
+  name: mscclpp-it
+container:
+  image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+
+steps:
+- task: Bash@3
+  name: Build
+  displayName: Build
+  inputs:
+    targetType: 'inline'
+    script: |
+      curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
+      tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
+      mkdir build && cd build
+      MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+      make -j
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: DownloadSecureFile@1
+  name: SshKeyFile
+  displayName: Download key file
+  inputs:
+    secureFile: ssh.key
+
+- task: Bash@3
+  name: InstallPackages
+  displayName: Install Packages
+  inputs:
+    targetType: 'inline'
+    script: |
+      sudo apt-get update -y
+      sudo apt-get install pssh -y
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+- task: AzureCLI@2
+  name: StartVMSS
+  displayName: Start VMSS
+  inputs:
+    azureSubscription: mscclpp
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss start --name mscclpp-it-vmss --resource-group msccl-dev
+
+- task: Bash@3
+  name: DeployTestEnv
+  displayName: Deploy Test Env
+  inputs:
+    targetType: filePath
+    filePath: test/mscclpp-test/deploy/deploy.sh
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: RunTests
+  displayName: Run multi-nodes test
+  inputs:
+    targetType: 'inline'
+    script: |
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      mkdir -p output
+      touch output/mscclpp-it-000000
+      tail -f output/mscclpp-it-000000 &
+      TAIL_PID=$!
+      # Stop the log follower when the step ends; an EXIT trap keeps the exit status of parallel-ssh as the step result.
+      trap 'kill "${TAIL_PID}" 2>/dev/null' EXIT
+      parallel-ssh -i -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
+        -O "${SSH_OPTION}" -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh'
+
+- task: AzureCLI@2
+  name: StopVMSS
+  displayName: Deallocate VMSS
+  condition: always()
+  inputs:
+    azureSubscription: mscclpp
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev
diff --git a/test/mscclpp-test/deploy/config b/test/mscclpp-test/deploy/config
new file mode 100644
index 00000000..80ac1132
--- /dev/null
+++ b/test/mscclpp-test/deploy/config
@@ -0,0 +1,8 @@
+Host mscclpp-it-000000
+  Port 22345
+  IdentityFile /root/mscclpp/sshkey
+  StrictHostKeyChecking no
+Host mscclpp-it-000001
+  Port 22345
+  IdentityFile /root/mscclpp/sshkey
+  StrictHostKeyChecking no
diff --git a/test/mscclpp-test/deploy/deploy.sh b/test/mscclpp-test/deploy/deploy.sh
new file mode 100644
index 00000000..7211712c
--- /dev/null
+++ b/test/mscclpp-test/deploy/deploy.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Deploy the mscclpp build and the test scripts to every host listed in the
+# hostfile, then start and set up the test container on each of them.
+#
+# Environment read by this script:
+#   SSHKEYFILE_SECUREFILEPATH      - private key used to reach the hosts
+#   SYSTEM_DEFAULTWORKINGDIRECTORY - checkout that contains build/ and the deploy files
+set -e
+
+KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
+SRC_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/build"
+DST_DIR="/tmp/mscclpp"
+HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test/deploy/hostfile"
+DEPLOY_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test/deploy"
+SSH_OPTION="StrictHostKeyChecking=no"
+
+chmod 400 "${KeyFilePath}"
+# Generate the key pair the containers use to reach each other. Remove any pair
+# left by an earlier run first, because ssh-keygen prompts before overwriting.
+rm -f sshkey sshkey.pub
+ssh-keygen -t rsa -f sshkey -P ""
+
+# Wait until every host answers over SSH. A command used as a loop condition
+# does not trigger `set -e`, so errexit does not need to be toggled here.
+until parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "hostname"; do
+  echo "Waiting for sshd to start..."
+  sleep 5
+done
+
+# Start from an empty destination directory, then copy the build output into it.
+parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "rm -rf ${DST_DIR}"
+parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "mkdir -p ${DST_DIR}"
+parallel-scp -t 0 -r -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "${SRC_DIR}" "${DST_DIR}"
+
+# Copy the generated key pair and every file of the deploy directory next to the build.
+parallel-scp -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" sshkey "${DST_DIR}"
+parallel-scp -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" sshkey.pub "${DST_DIR}"
+parallel-scp -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "${DEPLOY_DIR}"/* "${DST_DIR}"
+
+# Start the test container with ${DST_DIR} mounted at /root/mscclpp, then run setup.sh inside it.
+parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
+  "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
+  -w /root -v ${DST_DIR}:/root/mscclpp --name=mscclpp-test \
+  --entrypoint /bin/bash ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1"
+parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
+  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/setup.sh'"
diff --git a/test/mscclpp-test/deploy/hostfile b/test/mscclpp-test/deploy/hostfile
new file mode 100644
index 00000000..a64d0f91
--- /dev/null
+++ b/test/mscclpp-test/deploy/hostfile
@@ -0,0 +1,2 @@
+azureuser@mscclpp-it-000000
+azureuser@mscclpp-it-000001
diff --git a/test/mscclpp-test/deploy/hostfile_mpi b/test/mscclpp-test/deploy/hostfile_mpi
new file mode 100644
index 00000000..46114baf
--- /dev/null
+++ b/test/mscclpp-test/deploy/hostfile_mpi
@@ -0,0 +1,2 @@
+mscclpp-it-000000
+mscclpp-it-000001
diff --git a/test/mscclpp-test/deploy/run_tests.sh b/test/mscclpp-test/deploy/run_tests.sh
new file mode 100644
index 00000000..4045cf68
--- /dev/null
+++ b/test/mscclpp-test/deploy/run_tests.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Run the mscclpp-test performance binaries through mpirun: 16 ranks, 8 per
+# node, on the hosts listed in /root/mscclpp/hostfile_mpi.
+set -e
+
+echo "=================Run allgather_test_perf on 2 nodes========================="
+/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
+  -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
+  -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0
+
+# For kernel 2, the message size must be divisible by 3
+/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
+  -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
+  -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2
+
+echo "==================Run alltoall_test_perf on 2 nodes========================="
+/usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
+  -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
+  -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0
diff --git a/test/mscclpp-test/deploy/setup.sh b/test/mscclpp-test/deploy/setup.sh
new file mode 100644
index 00000000..105b6552
--- /dev/null
+++ b/test/mscclpp-test/deploy/setup.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Configure SSH for root inside the test container: install the deployed public
+# key and client config, install openssh-server, and start sshd on port 22345.
+set -e
+
+mkdir -p /root/.ssh
+mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
+chown root:root /root/.ssh/authorized_keys
+mv /root/mscclpp/config /root/.ssh/config
+chown root:root /root/.ssh/config
+chmod 400 /root/mscclpp/sshkey
+chown root:root /root/mscclpp/sshkey
+
+apt-get update -y
+apt-get install openssh-server -y
+
+# Create /var/run/sshd, then start sshd on the port named in the deployed ssh config.
+mkdir -p /var/run/sshd
+/usr/sbin/sshd -p 22345