Create Azure pipeline for multi-node tests (#97)

Create Azure pipeline to run mscclpp-test on multi-nodes
This commit is contained in:
Binyang2014
2023-06-13 14:34:07 +08:00
committed by GitHub
parent 76718e4015
commit 6ee4e80317
7 changed files with 165 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
# Multi-node test pipeline: builds mscclpp, starts the test VM scale set,
# deploys the build to the nodes, runs mscclpp-test on them and finally
# deallocates the scale set again.
trigger:
- main

# Do not run multi-nodes-test for PR, we can trigger it manually
pr: none

pool:
  name: mscclpp-it
container:
  image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1

steps:
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      # Stop at the first failing command instead of reporting only the last one.
      set -e
      curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
      tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
      # -p: do not fail when the workspace already contains a build directory.
      mkdir -p build && cd build
      MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# The downloaded key path is exposed to later steps as SSHKEYFILE_SECUREFILEPATH.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ssh.key

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      # Without this a failed pssh install would be hidden by the last command succeeding.
      set -eo pipefail
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: mscclpp
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name mscclpp-it-vmss --resource-group msccl-dev

- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/mscclpp-test/deploy/deploy.sh
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: RunTests
  displayName: Run multi-nodes test
  inputs:
    targetType: 'inline'
    script: |
      set -e
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      mkdir -p output
      touch output/mscclpp-it-000000
      # Stream the remote output into the pipeline log while the tests are running.
      tail -f output/mscclpp-it-000000 &
      TAIL_PID=$!
      # Make sure the background tail does not outlive this step.
      trap 'kill ${TAIL_PID} 2>/dev/null || true' EXIT
      parallel-ssh -i -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh'

# Always deallocate the scale set, even when an earlier step failed.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: mscclpp
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev

View File

@@ -0,0 +1,8 @@
# SSH client settings for reaching the two test nodes.
# Port 22345 matches the port the sshd in setup.sh is started on, and the
# identity file is the key pair that deploy.sh copies to /tmp/mscclpp on each
# node (mounted as /root/mscclpp in the test container).
Host mscclpp-it-000000 mscclpp-it-000001
  Port 22345
  IdentityFile /root/mscclpp/sshkey
  StrictHostKeyChecking no

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Deploys the mscclpp build and the test helper files to every node listed in
# the hostfile, starts the test container on each node and runs setup.sh in it.
#
# Required environment:
#   SSHKEYFILE_SECUREFILEPATH       private key used to log in to the nodes
#   SYSTEM_DEFAULTWORKINGDIRECTORY  root of the checked-out repository
# Optional environment:
#   SSH_WAIT_MAX_ATTEMPTS  number of connection attempts (5 seconds apart)
#                          before giving up; defaults to 120
set -euo pipefail

KeyFilePath="${SSHKEYFILE_SECUREFILEPATH:?SSHKEYFILE_SECUREFILEPATH is not set}"
WORK_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY:?SYSTEM_DEFAULTWORKINGDIRECTORY is not set}"
SRC_DIR="${WORK_DIR}/build"
DST_DIR="/tmp/mscclpp"
HOSTFILE="${WORK_DIR}/test/mscclpp-test/deploy/hostfile"
DEPLOY_DIR="${WORK_DIR}/test/mscclpp-test/deploy"
SSH_OPTION="StrictHostKeyChecking=no"
MAX_ATTEMPTS="${SSH_WAIT_MAX_ATTEMPTS:-120}"

# Options shared by every parallel-ssh / parallel-scp call below.
PSSH_OPTS=(-t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}")

chmod 400 "${KeyFilePath}"

# Generate a fresh key pair for the nodes. Remove leftovers first: ssh-keygen
# asks before overwriting and would fail in a non-interactive run.
rm -f sshkey sshkey.pub
ssh-keygen -t rsa -f sshkey -P ""

# Wait until every node accepts SSH connections. A command used as the `until`
# condition is exempt from `set -e`, so failed attempts do not abort the script.
attempt=1
until parallel-ssh -i "${PSSH_OPTS[@]}" "hostname"; do
  if [ "${attempt}" -ge "${MAX_ATTEMPTS}" ]; then
    echo "Nodes are still unreachable over SSH after ${attempt} attempts; giving up." >&2
    exit 1
  fi
  echo "Waiting for sshd to start..."
  attempt=$((attempt + 1))
  sleep 5
done

# Start from an empty destination directory on every node.
parallel-ssh -i "${PSSH_OPTS[@]}" "rm -rf ${DST_DIR}"
parallel-ssh -i "${PSSH_OPTS[@]}" "mkdir -p ${DST_DIR}"

# Copy the build tree, the generated key pair and the deploy helper files.
parallel-scp -r "${PSSH_OPTS[@]}" "${SRC_DIR}" "${DST_DIR}"
parallel-scp "${PSSH_OPTS[@]}" sshkey "${DST_DIR}"
parallel-scp "${PSSH_OPTS[@]}" sshkey.pub "${DST_DIR}"
parallel-scp "${PSSH_OPTS[@]}" "${DEPLOY_DIR}"/* "${DST_DIR}"

# Remove a container left over from an earlier run; `docker run --name` fails
# when the name is already taken.
parallel-ssh -i "${PSSH_OPTS[@]}" \
  "sudo docker rm -f mscclpp-test >/dev/null 2>&1 || true"

# Start the test container with the deployed files mounted at /root/mscclpp.
parallel-ssh -i "${PSSH_OPTS[@]}" \
  "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
  -w /root -v ${DST_DIR}:/root/mscclpp --name=mscclpp-test \
  --entrypoint /bin/bash ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1"

# Configure SSH inside the container.
parallel-ssh -i "${PSSH_OPTS[@]}" \
  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/setup.sh'"

View File

@@ -0,0 +1,2 @@
# Test nodes, one user@host entry per line.
# NOTE(review): presumably the hostfile that deploy.sh passes to parallel-ssh / parallel-scp via -h — confirm.
azureuser@mscclpp-it-000000
azureuser@mscclpp-it-000001

View File

@@ -0,0 +1,2 @@
# Test nodes, one host name per line.
# NOTE(review): presumably the hostfile_mpi that run_tests.sh passes to mpirun via -hostfile — confirm.
mscclpp-it-000000
mscclpp-it-000001

View File

@@ -0,0 +1,16 @@
# Runs the mscclpp-test performance binaries across both nodes
# (16 ranks in total, 8 per node). Aborts on the first failing run.
set -e

MPIRUN=/usr/local/mpi/bin/mpirun
TEST_BIN_DIR=/root/mscclpp/build/test/mscclpp-test

# Launch one test binary under mpirun with the common launch options.
#   $1    name of the binary inside TEST_BIN_DIR
#   $2..  arguments forwarded to the binary
run_perf_test() {
  local binary=$1
  shift
  "${MPIRUN}" --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
    -npernode 8 "${TEST_BIN_DIR}/${binary}" "$@"
}

echo "=================Run allgather_test_perf on 2 nodes========================="
run_perf_test allgather_test_perf -b 1K -e 1G -f 2 -k 0
# For kernel 2, the message size must be divisible by 3
run_perf_test allgather_test_perf -b 3K -e 3G -f 2 -k 2

echo "==================Run alltoall_test_perf on 2 nodes========================="
run_perf_test alltoall_test_perf -b 1K -e 1G -f 2 -k 0

View File

@@ -0,0 +1,15 @@
# Configures SSH for root: installs the deployed public key and client config,
# locks down the private key, then installs and starts an sshd on port 22345.
set -e

SSH_DIR=/root/.ssh
DEPLOYED_DIR=/root/mscclpp

# Move the deployed public key and client config into root's SSH directory.
mkdir -p "${SSH_DIR}"
mv "${DEPLOYED_DIR}/sshkey.pub" "${SSH_DIR}/authorized_keys"
mv "${DEPLOYED_DIR}/config" "${SSH_DIR}/config"

# The private key stays in place; make it read-only for its owner.
chmod 400 "${DEPLOYED_DIR}/sshkey"

# All three files must belong to root.
chown root:root "${SSH_DIR}/authorized_keys" "${SSH_DIR}/config" "${DEPLOYED_DIR}/sshkey"

# Install the SSH server and start it on port 22345.
apt-get update -y
apt-get install openssh-server -y
mkdir -p /var/run/sshd
/usr/sbin/sshd -p 22345