mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
Create Azure pipeline for multi-node tests (#97)
Create Azure pipeline to run mscclpp-test on multi-nodes
This commit is contained in:
84
.azure-pipelines/multi-nodes-test.yml
Normal file
84
.azure-pipelines/multi-nodes-test.yml
Normal file
@@ -0,0 +1,84 @@
|
||||
# Azure pipeline that builds mscclpp, deploys it to the mscclpp-it VM scale set
# and runs mscclpp-test across multiple nodes.
trigger:
- main

# Do not run the multi-node test for PRs; it can be triggered manually.
pr: none

pool:
  name: mscclpp-it
container:
  image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1

steps:
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      # Stop at the first failing command instead of building from a bad state.
      set -e
      # -f makes curl fail on HTTP errors rather than saving an error page as the tarball.
      curl -fL https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
      tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
      # -p keeps the step working when the build directory already exists in a reused workspace.
      mkdir -p build && cd build
      MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# The downloaded path is read by later steps as SSHKEYFILE_SECUREFILEPATH.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ssh.key

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: mscclpp
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name mscclpp-it-vmss --resource-group msccl-dev

- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/mscclpp-test/deploy/deploy.sh
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: RunTests
  displayName: Run multi-nodes test
  inputs:
    targetType: 'inline'
    script: |
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      mkdir -p output
      touch output/mscclpp-it-000000
      # Stream the remote output live while parallel-ssh is still running.
      tail -f output/mscclpp-it-000000 &
      TAIL_PID=$!
      # Stop the background tail on exit so it does not outlive the step.
      trap 'kill "${TAIL_PID}" 2>/dev/null || true' EXIT
      parallel-ssh -i -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh'

# Always deallocate the scale set, even when an earlier step failed.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: mscclpp
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev
|
||||
8
test/mscclpp-test/deploy/config
Normal file
8
test/mscclpp-test/deploy/config
Normal file
@@ -0,0 +1,8 @@
|
||||
# SSH client settings used between the test nodes: both hosts are reached on
# port 22345 with the key copied to /root/mscclpp/sshkey.
Host mscclpp-it-000000 mscclpp-it-000001
    Port 22345
    IdentityFile /root/mscclpp/sshkey
    StrictHostKeyChecking no
|
||||
38
test/mscclpp-test/deploy/deploy.sh
Normal file
38
test/mscclpp-test/deploy/deploy.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Copy the built mscclpp tree and the deploy files to every host in the
# hostfile, then start the `mscclpp-test` container there and run setup.sh
# inside it.
#
# Required environment:
#   SSHKEYFILE_SECUREFILEPATH       private key used to log in to the hosts
#   SYSTEM_DEFAULTWORKINGDIRECTORY  checkout directory that contains build/
# Optional environment:
#   MAX_SSH_WAIT_ATTEMPTS           number of sshd probes (5 s apart) before
#                                   giving up; defaults to 120
set -e

: "${SSHKEYFILE_SECUREFILEPATH:?SSHKEYFILE_SECUREFILEPATH is not set}"
: "${SYSTEM_DEFAULTWORKINGDIRECTORY:?SYSTEM_DEFAULTWORKINGDIRECTORY is not set}"

KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
SRC_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/build"
DST_DIR="/tmp/mscclpp"
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test/deploy/hostfile"
DEPLOY_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test/deploy"
SSH_OPTION="StrictHostKeyChecking=no"
MAX_SSH_WAIT_ATTEMPTS="${MAX_SSH_WAIT_ATTEMPTS:-120}"

# Options shared by every parallel-ssh / parallel-scp call below.
PSSH_COMMON=(-t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}")

chmod 400 "${KeyFilePath}"

# Generate a fresh key pair for node-to-node SSH. Remove leftovers first:
# ssh-keygen asks before overwriting an existing file, which fails when the
# workspace is reused and there is nobody to answer.
rm -f sshkey sshkey.pub
ssh-keygen -t rsa -f sshkey -P ""

# Wait until sshd answers on every host. The probe is the `until` condition,
# so its failure does not trip `set -e`; the attempt cap keeps the script from
# hanging forever when the hosts never become reachable.
attempt=0
until parallel-ssh -i "${PSSH_COMMON[@]}" "hostname"; do
  attempt=$((attempt + 1))
  if [ "${attempt}" -ge "${MAX_SSH_WAIT_ATTEMPTS}" ]; then
    echo "sshd is still unreachable after ${attempt} attempts, giving up" >&2
    exit 1
  fi
  echo "Waiting for sshd to start..."
  sleep 5
done

# Recreate the destination directory and copy the build output into it.
parallel-ssh -i "${PSSH_COMMON[@]}" "rm -rf ${DST_DIR}"
parallel-ssh -i "${PSSH_COMMON[@]}" "mkdir -p ${DST_DIR}"
parallel-scp "${PSSH_COMMON[@]}" -r "${SRC_DIR}" "${DST_DIR}"

# Copy the generated key pair and the deploy files (config, hostfiles, scripts).
parallel-scp "${PSSH_COMMON[@]}" sshkey "${DST_DIR}"
parallel-scp "${PSSH_COMMON[@]}" sshkey.pub "${DST_DIR}"
parallel-scp "${PSSH_COMMON[@]}" "${DEPLOY_DIR}"/* "${DST_DIR}"

# `docker run --name` fails when the name is already in use, so drop any
# container left over from a previous run; a missing container is not an error.
parallel-ssh -i "${PSSH_COMMON[@]}" \
  "sudo docker rm -f mscclpp-test >/dev/null 2>&1 || true"

# Start the test container with DST_DIR mounted at /root/mscclpp.
parallel-ssh -i "${PSSH_COMMON[@]}" \
  "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
  -w /root -v ${DST_DIR}:/root/mscclpp --name=mscclpp-test \
  --entrypoint /bin/bash ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1"

# Run the in-container setup script.
parallel-ssh -i "${PSSH_COMMON[@]}" \
  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/setup.sh'"
|
||||
|
||||
2
test/mscclpp-test/deploy/hostfile
Normal file
2
test/mscclpp-test/deploy/hostfile
Normal file
@@ -0,0 +1,2 @@
|
||||
azureuser@mscclpp-it-000000
|
||||
azureuser@mscclpp-it-000001
|
||||
2
test/mscclpp-test/deploy/hostfile_mpi
Normal file
2
test/mscclpp-test/deploy/hostfile_mpi
Normal file
@@ -0,0 +1,2 @@
|
||||
mscclpp-it-000000
|
||||
mscclpp-it-000001
|
||||
16
test/mscclpp-test/deploy/run_tests.sh
Normal file
16
test/mscclpp-test/deploy/run_tests.sh
Normal file
@@ -0,0 +1,16 @@
|
||||
# Run the mscclpp-test performance binaries on 2 nodes, 8 processes per node.
set -e

# run_perf <test binary name> <test arguments...>
# Launches one binary from the build tree under mpirun with the options that
# every run in this script shares.
run_perf() {
  local test_name="$1"
  shift
  /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa \
    -hostfile /root/mscclpp/hostfile_mpi \
    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
    -npernode 8 "/root/mscclpp/build/test/mscclpp-test/${test_name}" "$@"
}

echo "=================Run allgather_test_perf on 2 nodes========================="
run_perf allgather_test_perf -b 1K -e 1G -f 2 -k 0

# For kernel 2, the message size must be divisible by 3
run_perf allgather_test_perf -b 3K -e 3G -f 2 -k 2

echo "==================Run alltoall_test_perf on 2 nodes========================="
run_perf alltoall_test_perf -b 1K -e 1G -f 2 -k 0
|
||||
15
test/mscclpp-test/deploy/setup.sh
Normal file
15
test/mscclpp-test/deploy/setup.sh
Normal file
@@ -0,0 +1,15 @@
|
||||
# Set up SSH for root from the files under /root/mscclpp, then install
# openssh-server and start sshd on port 22345.
set -e

MSCCLPP_DIR=/root/mscclpp
SSH_DIR=/root/.ssh

mkdir -p "${SSH_DIR}"

# The public key becomes root's authorized_keys.
mv "${MSCCLPP_DIR}/sshkey.pub" "${SSH_DIR}/authorized_keys"
chown root:root "${SSH_DIR}/authorized_keys"

# The client config becomes root's ssh config.
mv "${MSCCLPP_DIR}/config" "${SSH_DIR}/config"
chown root:root "${SSH_DIR}/config"

# The private key stays in place, read-only and owned by root.
chmod 400 "${MSCCLPP_DIR}/sshkey"
chown root:root "${MSCCLPP_DIR}/sshkey"

apt-get update -y
apt-get install openssh-server -y

mkdir -p /var/run/sshd
/usr/sbin/sshd -p 22345
|
||||
Reference in New Issue
Block a user