az pipeline refactoring

This commit is contained in:
Changho Hwang
2026-03-07 02:23:30 +00:00
parent 00583da21b
commit c699b8a784
18 changed files with 466 additions and 1042 deletions

View File

@@ -47,7 +47,6 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: CodeCoverageH100
@@ -69,7 +68,6 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: CodeCoverageMI300X
@@ -91,6 +89,5 @@ jobs:
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
platform: rocm
gpuArch: gfx942

View File

@@ -45,7 +45,6 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -65,6 +64,5 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -37,33 +37,6 @@ jobs:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -77,23 +50,11 @@ jobs:
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
- task: Bash@3
name: RunMscclppTest
@@ -101,18 +62,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
"bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test"
- task: Bash@3
name: RunMultiNodeUnitTest
@@ -120,18 +71,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
"bash /root/mscclpp/test/deploy/run_tests.sh mp-ut"
- task: Bash@3
name: RunMultiNodePythonTests
@@ -139,18 +80,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
"bash /root/mscclpp/test/deploy/run_tests.sh pytests"
- task: Bash@3
name: RunMultiNodePythonBenchmark
@@ -158,26 +89,11 @@ jobs:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
"bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark"
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/stop.yaml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT

View File

@@ -44,7 +44,6 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -65,5 +64,4 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -44,5 +44,4 @@ jobs:
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
gpuArch: gfx942

View File

@@ -3,8 +3,6 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: platform
type: string
default: 'cuda'
@@ -12,57 +10,17 @@ parameters:
type: string
steps:
- task: Bash@3
name: BuildCoverage
displayName: Build with coverage
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
else
cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true ${{ parameters.platform }}"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- task: Bash@3
name: TestsCoverageNonPerf
@@ -70,30 +28,26 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
export GCOV_PREFIX=/root/mscclpp; \
export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
cd build; \
lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent; \
lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info; \
lcov --list coverage.info"'
kill $CHILD_PID
# Runs the unit tests and the 2- and 4-process mp_unit_tests through
# run-remote.sh, then captures, filters and lists lcov coverage data.
# NOTE(review): escaping inside this single-quoted argument is inconsistent.
# Most expansions are written with a backslash before the dollar sign, but
# ${LCOV_CAPTURE_ARGS} and the double quotes around the lcov --extract globs
# are not escaped. Depending on how run-remote.sh re-quotes its argument (not
# visible here), either the escaped or the bare forms are evaluated at the
# wrong level -- confirm against run-remote.sh and make them uniform.
test/deploy/run-remote.sh '\
BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
export GCOV_PREFIX=/root/mscclpp; \
export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
./build/bin/unit_tests; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
lcov --version; \
LCOV_CAPTURE_ARGS=""; \
if lcov --help 2>&1 | grep -q "inconsistent"; then \
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"; \
fi; \
lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}; \
if [ ! -s coverage.info ]; then \
echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."; \
exit 1; \
fi; \
lcov --extract coverage.info "\${BUILD_PREFIX}/src/*" "\${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info; \
lcov --list coverage.info'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -124,13 +78,7 @@ steps:
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,127 @@
# Inputs of the deploy template. Only `subscription` and `vmssName` are
# required; every other parameter falls back to the default declared here.
parameters:
# Passed as `azureSubscription` to the AzureCLI@2 steps.
- name: subscription
  type: string
# Scale set that is checked and started.
- name: vmssName
  type: string
# Resource group given to the `az vmss` commands.
- name: resourceGroup
  type: string
  default: 'mscclpp'
# Build parameters
# 'rocm' selects the hipcc/ROCm configuration; any other value builds for CUDA.
- name: platform
  type: string
  default: 'cuda'
# Forwarded as MSCCLPP_GPU_ARCHS when non-empty.
- name: gpuArch
  type: string
  default: ''
# Forwarded as CMAKE_BUILD_TYPE.
- name: buildType
  type: string
  default: 'Release'
# Adds -DMSCCLPP_BUILD_TESTS=ON when true.
- name: buildTests
  type: boolean
  default: true
# Extra flags appended to the cmake command line.
- name: cmakeArgs
  type: string
  default: ''
# `name` and `displayName` of the build step.
- name: buildName
  type: string
  default: 'Build'
- name: buildDisplayName
  type: string
  default: 'Build'
# Deploy parameters
# Arguments handed to test/deploy/deploy.sh.
- name: deployArgs
  type: string
  default: ''
steps:
# 1. Check VMSS availability (fast, fail-fast)
# Fails the job when the scale set has no instances or when any instance
# reports provisioningState "Failed", before time is spent on the build.
# NOTE(review): AzureCLI@2 needs `az` to be present already, yet the
# InstallPackages step that installs the Azure CLI runs later in this
# template -- confirm the agent/container image ships it.
- task: AzureCLI@2
  name: CheckVMSS
  displayName: Check VMSS Availability
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      set -e
      # Use the CLI's built-in JMESPath query (one provisioningState per line)
      # so the check does not depend on jq being installed.
      STATES=$(az vmss list-instances \
        --name "${{ parameters.vmssName }}" \
        --resource-group "${{ parameters.resourceGroup }}" \
        --query "[].provisioningState" -o tsv)
      # grep -c exits 1 when nothing matches; "|| true" keeps set -e from
      # aborting while the printed count (0) is still captured.
      COUNT=$(printf '%s\n' "$STATES" | grep -c . || true)
      if [ "$COUNT" -eq 0 ]; then
        echo "##vso[task.logissue type=error]No VMSS instances found for ${{ parameters.vmssName }}"
        exit 1
      fi
      FAILED=$(printf '%s\n' "$STATES" | grep -cx 'Failed' || true)
      if [ "$FAILED" -gt 0 ]; then
        echo "##vso[task.logissue type=error]$FAILED VMSS instance(s) in Failed state"
        exit 1
      fi
      echo "VMSS ${{ parameters.vmssName }}: $COUNT instance(s) available"
# 2. Build
# Configures and builds the project in ./build, then writes the source root
# path to build/BUILD_PREFIX.
# Template if/else directives are only valid as YAML mapping keys or sequence
# items (and there is no "endif"), so they cannot be embedded in the script
# text. Parameter values are expanded into the script instead and all
# branching is done in bash.
- task: Bash@3
  name: ${{ parameters.buildName }}
  displayName: ${{ parameters.buildDisplayName }}
  inputs:
    targetType: 'inline'
    script: |
      set -e
      rm -rf build
      mkdir -p build && cd build
      CMAKE_ARGS=(
        "-DCMAKE_BUILD_TYPE=${{ parameters.buildType }}"
        -DMSCCLPP_BYPASS_GPU_CHECK=ON
      )
      if [ "${{ parameters.platform }}" = "rocm" ]; then
        # ROCm builds are compiled with hipcc.
        export CXX=/opt/rocm/bin/hipcc
        CMAKE_ARGS+=(-DMSCCLPP_USE_ROCM=ON)
      else
        CMAKE_ARGS+=(-DMSCCLPP_USE_CUDA=ON)
      fi
      # A boolean parameter is rendered as True/False when expanded into text.
      case "${{ parameters.buildTests }}" in
        [Tt]rue) CMAKE_ARGS+=(-DMSCCLPP_BUILD_TESTS=ON) ;;
      esac
      if [ -n "${{ parameters.gpuArch }}" ]; then
        CMAKE_ARGS+=("-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}")
      fi
      # cmakeArgs stays unquoted on purpose so several flags split into words.
      cmake "${CMAKE_ARGS[@]}" ${{ parameters.cmakeArgs }} ..
      make -j
      cd ..
      pwd > build/BUILD_PREFIX
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# 3. Download SSH key + install packages + start VMSS
# Fetches the secure file holding the SSH private key.
# NOTE(review): the secure-file name is fixed here rather than taken from a
# parameter -- confirm every pipeline that includes this template uses this key.
- task: DownloadSecureFile@1
  displayName: Download key file
  name: SshKeyFile
  inputs:
    secureFile: 'mscclpp.pem'
# Installs pssh and the Azure CLI on the agent.
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      # Stop at the first failing command; pipefail also surfaces a failed
      # download instead of only the exit status of "sudo bash".
      set -eo pipefail
      sudo apt-get update -y
      sudo apt-get install pssh -y
      # -f makes curl fail on an HTTP error rather than piping the error page
      # into a root shell; -S still prints the error while -s hides progress.
      curl -fsSL https://aka.ms/InstallAzureCLIDeb | sudo bash
# Starts the scale set named by vmssName in the given resource group.
- task: AzureCLI@2
  displayName: Start VMSS
  name: StartVMSS
  inputs:
    scriptLocation: inlineScript
    scriptType: bash
    azureSubscription: ${{ parameters.subscription }}
    inlineScript: |
      az vmss start \
        --name ${{ parameters.vmssName }} \
        --resource-group ${{ parameters.resourceGroup }}
# 4. Deploy test environment
# Runs test/deploy/deploy.sh from the default working directory, passing
# deployArgs through unchanged.
- task: Bash@3
  displayName: Deploy Test Env
  name: DeployTestEnv
  inputs:
    targetType: filePath
    workingDirectory: '$(System.DefaultWorkingDirectory)'
    filePath: test/deploy/deploy.sh
    arguments: ${{ parameters.deployArgs }}

View File

@@ -3,8 +3,6 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
@@ -12,51 +10,12 @@ parameters:
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- task: Bash@3
name: AllGatherTest
@@ -64,24 +23,12 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -90,21 +37,9 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -113,27 +48,15 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
# Runs allreduce_test_perf through run-remote.sh: once without -k and once for
# each of -k 1 through 6; every run is given -o output.jsonl. The -k 5 run
# takes only the -i step value (no extra positional argument).
test/deploy/run-remote.sh '\
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -142,21 +65,10 @@ steps:
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
test/deploy/run-remote.sh '\
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -165,21 +77,9 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
set -e; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -188,55 +88,13 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
set -e; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -4,99 +4,22 @@
#
# Parameters:
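# subscription Azure subscription to use for VMSS start/stop
# vmssName name of the Azure VM scale set that is started and stopped
# nvccGencode value passed as NVCC_GENCODE to the NCCL build (default targets sm_80)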
# sshKeySecureFile the secureFile name for your SSH key
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: nccltest-single-node
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
deployArgs: 'nccltest-single-node'
- task: Bash@3
name: InstallNcclTests
@@ -104,85 +27,22 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
test/deploy/run-remote.sh '\
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNccl
displayName: Install NCCL
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl.git; \
cd nccl; \
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
test/deploy/run-remote.sh '\
cd; git clone https://github.com/NVIDIA/nccl.git; \
cd nccl; \
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -191,19 +51,9 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
test/deploy/run-remote.sh '\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -212,19 +62,9 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
test/deploy/run-remote.sh '\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -233,48 +73,12 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
test/deploy/run-remote.sh '\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbackToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -5,7 +5,6 @@
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# sshKeySecureFile the secureFile name for your SSH key
# gpuArch GPU architecture (e.g. gfx942)
parameters:
@@ -13,56 +12,19 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
default: "gfx942"
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true rocm"
workingDirectory: $(System.DefaultWorkingDirectory)
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: rocm
gpuArch: ${{ parameters.gpuArch }}
buildTests: false
deployArgs: 'single-node-test true rocm'
- task: Bash@3
@@ -71,21 +33,15 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; \
test/deploy/run-remote.sh '\
cd; \
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
cd rocm-systems; \
git sparse-checkout init --cone; \
git sparse-checkout set projects/rccl-tests; \
git checkout; \
cd projects/rccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
cd rocm-systems; \
git sparse-checkout init --cone; \
git sparse-checkout set projects/rccl-tests; \
git checkout; \
cd projects/rccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -94,19 +50,9 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
test/deploy/run-remote.sh '\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -115,28 +61,12 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
test/deploy/run-remote.sh '\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,20 @@
# Step template that deallocates a VM scale set through the Azure CLI.
#
# Parameters:
#   subscription    value passed to the AzureCLI task's azureSubscription input
#   vmssName        name of the VM scale set to deallocate
#   resourceGroup   resource group passed to `az vmss deallocate` (default: mscclpp)
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: resourceGroup
    type: string
    default: mscclpp

steps:
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    # always() keeps this step running even when an earlier step failed or the
    # run was canceled.
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      # Template expressions are substituted as plain text before bash sees the
      # script, so the expanded values are double-quoted to keep each one a
      # single shell word (no word splitting or glob expansion).
      inlineScript: |
        az vmss deallocate --name "${{ parameters.vmssName }}" --resource-group "${{ parameters.resourceGroup }}"

View File

@@ -3,57 +3,17 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
deployArgs: 'single-node-test false'
- task: Bash@3
name: UnitTests
@@ -61,19 +21,8 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
./build/bin/unit_tests'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -82,22 +31,10 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -106,20 +43,8 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -128,11 +53,7 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
test/deploy/run-remote.sh --no-docker --no-log \
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -143,8 +64,15 @@ steps:
inputs:
targetType: 'inline'
script: |
rm -rf build && mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
set -e
rm -rf build
mkdir -p build && cd build
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
-DMSCCLPP_BUILD_TESTS=ON \
-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -163,29 +91,11 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -3,70 +3,18 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
mkdir -p build && cd build; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
make -j"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
deployArgs: 'single-node-test'
- task: Bash@3
name: MpUnitTests
@@ -74,27 +22,15 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
test/deploy/run-remote.sh '\
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -103,43 +39,25 @@ steps:
inputs:
targetType: 'inline'
script: |
# set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
test/deploy/run-remote.sh '\
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce.json'"'"'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce_packet.json'"'"'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -3,8 +3,6 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: platform
type: string
default: 'cuda'
@@ -12,55 +10,13 @@ parameters:
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
else
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
fi
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true ${{ parameters.platform }}"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/deploy.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- task: Bash@3
@@ -69,19 +25,8 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
./build/bin/unit_tests'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -90,22 +35,10 @@ steps:
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
@@ -114,29 +47,11 @@ steps:
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
test/deploy/run-remote.sh '\
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: templates/stop.yaml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -47,7 +47,6 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -69,7 +68,6 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -89,7 +87,6 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -109,7 +106,6 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
@@ -131,7 +127,6 @@ jobs:
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
@@ -151,6 +146,5 @@ jobs:
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
platform: rocm
gpuArch: gfx942

View File

@@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1: Allreduce will fall back to NCCL's ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2: ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

96
test/deploy/run-remote.sh Executable file
View File

@@ -0,0 +1,96 @@
#!/bin/bash
# Run a command on remote CI VMs via parallel-ssh.
# By default, runs inside the mscclpp-test docker container.
#
# Usage:
#   run-remote.sh [OPTIONS] [--] <command>
#
# Options:
#   --no-docker        Run command directly on the host, not inside docker
#   --no-log           Don't tail the log file in the background
#   --hostfile <path>  Override hostfile path (default: hostfile_ci next to this script)
#   --host <host>      Run command on a single host (uses parallel-ssh -H)
#   --user <user>      SSH user when using --host or custom hostfile
#   --                 End of options; everything after it is the command
#
# Environment:
#   SSHKEYFILE_SECUREFILEPATH  Path of the SSH private key handed to ssh with -i.
#                              When unset or empty, no -i option is passed and
#                              ssh falls back to its own defaults.
#
# Quoting caveat: in docker mode <command> is spliced into a double-quoted
# `bash -c "..."` string that is itself evaluated by the remote login shell, so
# unescaped double quotes or `$` in <command> are interpreted before the
# container sees them. Escape them (e.g. \$PATH) or use single quotes.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
USE_DOCKER=true
USE_LOG=true
TARGET_HOST=""
REMOTE_USER=""

# Print the full synopsis (all accepted options) to stderr.
usage() {
  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <host>] [--user <user>] [--] <command>" >&2
}

# Consume leading --options; the first non-option argument starts the command.
while [[ "$1" == --* ]]; do
  case "$1" in
    --no-docker) USE_DOCKER=false; shift ;;
    --no-log) USE_LOG=false; shift ;;
    --hostfile)
      if [ -z "$2" ]; then
        echo "Missing value for --hostfile" >&2
        exit 1
      fi
      HOSTFILE="$2"
      shift 2
      ;;
    --host)
      if [ -z "$2" ]; then
        echo "Missing value for --host" >&2
        exit 1
      fi
      TARGET_HOST="$2"
      shift 2
      ;;
    --user)
      if [ -z "$2" ]; then
        echo "Missing value for --user" >&2
        exit 1
      fi
      REMOTE_USER="$2"
      shift 2
      ;;
    # Explicit terminator so a command that itself begins with "--" can be passed.
    --) shift; break ;;
    *) echo "Unknown option: $1" >&2; usage; exit 1 ;;
  esac
done

if [ $# -eq 0 ]; then
  usage
  exit 1
fi

# All remaining arguments are joined with spaces into one remote command line.
CMD="$*"

# Target selection: a single host (-H) takes precedence over the hostfile (-h).
PSSH_TARGET_ARGS=()
if [ -n "$TARGET_HOST" ]; then
  PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
else
  PSSH_TARGET_ARGS=(-h "$HOSTFILE")
fi

PSSH_USER_ARGS=()
if [ -n "$REMOTE_USER" ]; then
  PSSH_USER_ARGS=(-l "$REMOTE_USER")
fi

# Only pass an identity file when one was provided; a bare "-i" would make ssh
# swallow the following argument as the key path.
PSSH_KEY_ARGS=()
if [ -n "$KeyFilePath" ]; then
  PSSH_KEY_ARGS=(-x "-i ${KeyFilePath}")
fi

if $USE_LOG; then
  # The log file is named after the target host: either --host as given, or the
  # first hostfile entry with any "user@" prefix removed.
  # NOTE(review): presumably this matches the per-host file parallel-ssh writes
  # under "-o ." -- confirm against the installed pssh version.
  if [ -n "$TARGET_HOST" ]; then
    HOST="$TARGET_HOST"
  else
    HOST=$(head -1 "${HOSTFILE}")
    HOST="${HOST##*@}"
  fi
  if [ -z "$HOST" ]; then
    echo "Cannot determine log file name: first line of ${HOSTFILE} is empty" >&2
    exit 1
  fi
  # Truncate/create the log, then stream it to stdout while the remote command runs.
  : > "${HOST}"
  tail -f "${HOST}" &
  CHILD_PID=$!
  # Stop the background tail on any exit path, including failures under `set -e`.
  trap "kill $CHILD_PID 2>/dev/null" EXIT
fi

if $USE_DOCKER; then
  # Output goes to per-host files in the current directory (-o .), which the
  # background tail above streams. The remote shell runs with `set -ex` from
  # /root/mscclpp inside the mscclpp-test container.
  parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" "${PSSH_KEY_ARGS[@]}" -o . \
    -O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; ${CMD}\""
else
  # Host mode prints output inline (-i) instead of writing per-host files.
  parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" "${PSSH_KEY_ARGS[@]}" \
    -O "$SSH_OPTION" "set -ex; ${CMD}"
fi

View File

@@ -1,6 +1,5 @@
set -e
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
export PATH=/usr/local/mpi/bin:$PATH
function run_mscclpp_test()
{