mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
az pipeline refactoring
This commit is contained in:
@@ -47,7 +47,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: CodeCoverageH100
|
||||
@@ -69,7 +68,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: CodeCoverageMI300X
|
||||
@@ -91,6 +89,5 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
platform: rocm
|
||||
gpuArch: gfx942
|
||||
|
||||
@@ -45,7 +45,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: IntegrationTestH100
|
||||
@@ -65,6 +64,5 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
|
||||
gpuArch: '90'
|
||||
|
||||
@@ -37,33 +37,6 @@ jobs:
|
||||
image: $[ variables['containerImage'] ]
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: mscclpp-ssh.key
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: Bash@3
|
||||
displayName: Add HostEntry
|
||||
inputs:
|
||||
@@ -77,23 +50,11 @@ jobs:
|
||||
echo "Entry already exists, nothing to do."
|
||||
fi
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: msccl-it
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name mscclit-vmss --resource-group msccl-IT
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
resourceGroup: msccl-IT
|
||||
|
||||
- task: Bash@3
|
||||
name: RunMscclppTest
|
||||
@@ -101,18 +62,8 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
rm -rf output/*
|
||||
mkdir -p output
|
||||
touch output/mscclit-000000
|
||||
tail -f output/mscclit-000000 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
|
||||
"bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test"
|
||||
|
||||
- task: Bash@3
|
||||
name: RunMultiNodeUnitTest
|
||||
@@ -120,18 +71,8 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
rm -rf output/*
|
||||
mkdir -p output
|
||||
touch output/mscclit-000000
|
||||
tail -f output/mscclit-000000 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
|
||||
"bash /root/mscclpp/test/deploy/run_tests.sh mp-ut"
|
||||
|
||||
- task: Bash@3
|
||||
name: RunMultiNodePythonTests
|
||||
@@ -139,18 +80,8 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
rm -rf output/*
|
||||
mkdir -p output
|
||||
touch output/mscclit-000000
|
||||
tail -f output/mscclit-000000 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
|
||||
"bash /root/mscclpp/test/deploy/run_tests.sh pytests"
|
||||
|
||||
- task: Bash@3
|
||||
name: RunMultiNodePythonBenchmark
|
||||
@@ -158,26 +89,11 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
rm -rf output/*
|
||||
mkdir -p output
|
||||
touch output/mscclit-000000
|
||||
tail -f output/mscclit-000000 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh --hostfile "$(System.DefaultWorkingDirectory)/test/deploy/hostfile" --host mscclit-000000 --user azureuser \
|
||||
"bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark"
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: msccl-it
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
resourceGroup: msccl-IT
|
||||
|
||||
@@ -44,7 +44,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
|
||||
|
||||
- job: NcclTestH100
|
||||
@@ -65,5 +64,4 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
nvccGencode: "-gencode=arch=compute_90,code=sm_90"
|
||||
@@ -44,5 +44,4 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: gfx942
|
||||
|
||||
@@ -3,8 +3,6 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
@@ -12,57 +10,17 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: BuildCoverage
|
||||
displayName: Build with coverage
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
if [ "${{ parameters.platform }}" == "rocm" ]; then
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
|
||||
else
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_ENABLE_COVERAGE=ON ..
|
||||
fi
|
||||
make -j
|
||||
cd ..
|
||||
pwd > build/BUILD_PREFIX
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test true ${{ parameters.platform }}"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: ${{ parameters.platform }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
buildType: Debug
|
||||
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
|
||||
buildDisplayName: 'Build with coverage'
|
||||
buildName: BuildCoverage
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
- task: Bash@3
|
||||
name: TestsCoverageNonPerf
|
||||
@@ -70,30 +28,26 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
cd /root/mscclpp; \
|
||||
BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
|
||||
STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
|
||||
export GCOV_PREFIX=/root/mscclpp; \
|
||||
export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
./build/bin/unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
|
||||
cd build; \
|
||||
lcov --directory . --capture --output-file coverage.info --ignore-errors inconsistent; \
|
||||
lcov --extract coverage.info \"\${BUILD_PREFIX}/src/*\" \"\${BUILD_PREFIX}/include/mscclpp/*\" --output-file coverage.info; \
|
||||
lcov --list coverage.info"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
BUILD_PREFIX=\$(cat build/BUILD_PREFIX); \
|
||||
STRIP_COUNT=\$(echo \$BUILD_PREFIX | tr -cd / | wc -c); \
|
||||
export GCOV_PREFIX=/root/mscclpp; \
|
||||
export GCOV_PREFIX_STRIP=\$STRIP_COUNT; \
|
||||
./build/bin/unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests; \
|
||||
lcov --version; \
|
||||
LCOV_CAPTURE_ARGS=""; \
|
||||
if lcov --help 2>&1 | grep -q "inconsistent"; then \
|
||||
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"; \
|
||||
fi; \
|
||||
lcov --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}; \
|
||||
if [ ! -s coverage.info ]; then \
|
||||
echo "ERROR: coverage.info was not generated. Tests may have failed before coverage capture or produced no gcov data."; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
lcov --extract coverage.info "\${BUILD_PREFIX}/src/*" "\${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info; \
|
||||
lcov --list coverage.info'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -124,13 +78,7 @@ steps:
|
||||
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
127
.azure-pipelines/templates/deploy.yaml
Normal file
127
.azure-pipelines/templates/deploy.yaml
Normal file
@@ -0,0 +1,127 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: resourceGroup
|
||||
type: string
|
||||
default: mscclpp
|
||||
# Build parameters
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildType
|
||||
type: string
|
||||
default: 'Release'
|
||||
- name: buildTests
|
||||
type: boolean
|
||||
default: true
|
||||
- name: cmakeArgs
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildName
|
||||
type: string
|
||||
default: 'Build'
|
||||
- name: buildDisplayName
|
||||
type: string
|
||||
default: 'Build'
|
||||
# Deploy parameters
|
||||
- name: deployArgs
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
steps:
|
||||
# 1. Check VMSS availability (fast, fail-fast)
|
||||
- task: AzureCLI@2
|
||||
name: CheckVMSS
|
||||
displayName: Check VMSS Availability
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
set -e
|
||||
INSTANCES=$(az vmss list-instances --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -o json)
|
||||
COUNT=$(echo "$INSTANCES" | jq 'length')
|
||||
if [ "$COUNT" -eq 0 ]; then
|
||||
echo "##vso[task.logissue type=error]No VMSS instances found for ${{ parameters.vmssName }}"
|
||||
exit 1
|
||||
fi
|
||||
FAILED=$(echo "$INSTANCES" | jq '[.[] | select(.provisioningState == "Failed")] | length')
|
||||
if [ "$FAILED" -gt 0 ]; then
|
||||
echo "##vso[task.logissue type=error]$FAILED VMSS instance(s) in Failed state"
|
||||
exit 1
|
||||
fi
|
||||
echo "VMSS ${{ parameters.vmssName }}: $COUNT instance(s) available"
|
||||
|
||||
# 2. Build
|
||||
- task: Bash@3
|
||||
name: ${{ parameters.buildName }}
|
||||
displayName: ${{ parameters.buildDisplayName }}
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
rm -rf build
|
||||
mkdir -p build && cd build
|
||||
${{ if eq(parameters.platform, 'rocm') }}
|
||||
CXX=/opt/rocm/bin/hipcc cmake \
|
||||
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
|
||||
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
|
||||
-DMSCCLPP_USE_ROCM=ON \
|
||||
${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
|
||||
${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
|
||||
${{ parameters.cmakeArgs }} ..
|
||||
${{ else }}
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
|
||||
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
|
||||
-DMSCCLPP_USE_CUDA=ON \
|
||||
${{ if parameters.buildTests }}-DMSCCLPP_BUILD_TESTS=ON${{ endif }} \
|
||||
${{ if ne(parameters.gpuArch, '') }}-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}${{ endif }} \
|
||||
${{ parameters.cmakeArgs }} ..
|
||||
${{ endif }}
|
||||
make -j
|
||||
cd ..
|
||||
pwd > build/BUILD_PREFIX
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# 3. Download SSH key + install packages + start VMSS
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: mscclpp.pem
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
|
||||
|
||||
# 4. Deploy test environment
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: ${{ parameters.deployArgs }}
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
@@ -3,8 +3,6 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: perfBaselineFile
|
||||
type: string
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
@@ -12,51 +10,12 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- task: Bash@3
|
||||
name: AllGatherTest
|
||||
@@ -64,24 +23,12 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
set -e; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -90,21 +37,9 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -113,27 +48,15 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -142,21 +65,10 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -165,21 +77,9 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -188,55 +88,13 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
python3 -m pip install .; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
set -e; \
|
||||
python3 -m pip install .; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: FifoPerfBenchmark
|
||||
displayName: FIFO Performance Benchmark
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
./build/bin/perf/fifo_test"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -4,99 +4,22 @@
|
||||
#
|
||||
# Parameters:
|
||||
# subscription – Azure subscription to use for VMSS start/stop
|
||||
# sshKeySecureFile – the secureFile name for your SSH key
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: nvccGencode
|
||||
type: string
|
||||
default: "-gencode=arch=compute_80,code=sm_80"
|
||||
|
||||
steps:
|
||||
- checkout: self
|
||||
- checkout: git://One/msccl-users
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: mscclpp/test/deploy/deploy.sh
|
||||
arguments: nccltest-single-node
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: CopyMscclUsers
|
||||
displayName: Copy msccl-users
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
DST_DIR="/tmp/mscclpp/msccl-users"
|
||||
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# - task: Bash@3
|
||||
# name: GenerateExecutionFile
|
||||
# displayName: Generate execution file
|
||||
# inputs:
|
||||
# targetType: 'inline'
|
||||
# script: |
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
# cd /root/mscclpp/msccl-users; \
|
||||
# mkdir -p execution-files; \
|
||||
# cd /root/mscclpp/msccl-users; \
|
||||
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
deployArgs: 'nccltest-single-node'
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallNcclTests
|
||||
@@ -104,85 +27,22 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
|
||||
cd nccl-tests; \
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
|
||||
test/deploy/run-remote.sh '\
|
||||
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
|
||||
cd nccl-tests; \
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# - task: Bash@3
|
||||
# name: RunNcclAllReduceTest
|
||||
# displayName: Run NCCL AllReduce Test
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
# cd /root/mscclpp; \
|
||||
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# - task: Bash@3
|
||||
# name: RunNcclAllGatherTest
|
||||
# displayName: Run NCCL AllGather Test
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
# cd /root/mscclpp; \
|
||||
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# - task: Bash@3
|
||||
# name: RunNcclReduceScatterTest
|
||||
# displayName: Run NCCL Reduce Scatter Test
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
# cd /root/mscclpp; \
|
||||
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallNccl
|
||||
displayName: Install NCCL
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd; git clone https://github.com/NVIDIA/nccl.git; \
|
||||
cd nccl; \
|
||||
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
|
||||
test/deploy/run-remote.sh '\
|
||||
cd; git clone https://github.com/NVIDIA/nccl.git; \
|
||||
cd nccl; \
|
||||
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -191,19 +51,9 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -212,19 +62,9 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -233,48 +73,12 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# - task: Bash@3
|
||||
# name: RunNcclReduceScatterFallbaclkToNcclTest
|
||||
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
|
||||
# inputs:
|
||||
# targetType: 'inline'
|
||||
# script: |
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
|
||||
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
# cd /root/mscclpp; \
|
||||
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
|
||||
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
|
||||
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
# Parameters:
|
||||
# subscription – Azure subscription to use for VMSS start/stop
|
||||
# vmssName – VMSS name to start/stop
|
||||
# sshKeySecureFile – the secureFile name for your SSH key
|
||||
# gpuArch – GPU architecture (e.g. gfx942)
|
||||
|
||||
parameters:
|
||||
@@ -13,56 +12,19 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: "gfx942"
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test true rocm"
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: rocm
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
buildTests: false
|
||||
deployArgs: 'single-node-test true rocm'
|
||||
|
||||
|
||||
- task: Bash@3
|
||||
@@ -71,21 +33,15 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd; \
|
||||
test/deploy/run-remote.sh '\
|
||||
cd; \
|
||||
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
|
||||
cd rocm-systems; \
|
||||
git sparse-checkout init --cone; \
|
||||
git sparse-checkout set projects/rccl-tests; \
|
||||
git checkout; \
|
||||
cd projects/rccl-tests; \
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
|
||||
cd rocm-systems; \
|
||||
git sparse-checkout init --cone; \
|
||||
git sparse-checkout set projects/rccl-tests; \
|
||||
git checkout; \
|
||||
cd projects/rccl-tests; \
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -94,19 +50,9 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -115,28 +61,12 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
20
.azure-pipelines/templates/stop.yaml
Normal file
20
.azure-pipelines/templates/stop.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: resourceGroup
|
||||
type: string
|
||||
default: mscclpp
|
||||
|
||||
steps:
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
|
||||
@@ -3,57 +3,17 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: single-node-test false
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
|
||||
deployArgs: 'single-node-test false'
|
||||
|
||||
- task: Bash@3
|
||||
name: UnitTests
|
||||
@@ -61,19 +21,8 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
./build/bin/unit_tests"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
./build/bin/unit_tests'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -82,22 +31,10 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -106,20 +43,8 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -128,11 +53,7 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
test/deploy/run-remote.sh --no-docker --no-log \
|
||||
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
|
||||
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
@@ -143,8 +64,15 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
rm -rf build && mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
set -e
|
||||
rm -rf build
|
||||
mkdir -p build && cd build
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
|
||||
-DMSCCLPP_USE_CUDA=ON \
|
||||
-DMSCCLPP_BUILD_TESTS=ON \
|
||||
-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -163,29 +91,11 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
@@ -3,70 +3,18 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
|
||||
steps:
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
mkdir -p build && cd build; \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
|
||||
make -j"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- task: Bash@3
|
||||
name: MpUnitTests
|
||||
@@ -74,27 +22,15 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
test/deploy/run-remote.sh '\
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter=\"ExecutorTest.TwoNodesAllreduce\"; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
|
||||
kill $CHILD_PID
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -103,43 +39,25 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
# set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
test/deploy/run-remote.sh '\
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce.json'"'"'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
|
||||
kill $CHILD_PID
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k '"'"'test_executor[allreduce_packet.json'"'"'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
@@ -3,8 +3,6 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
@@ -12,55 +10,13 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
if [ "${{ parameters.platform }}" == "rocm" ]; then
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
else
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
fi
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test true ${{ parameters.platform }}"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: templates/deploy.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: ${{ parameters.platform }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
|
||||
- task: Bash@3
|
||||
@@ -69,19 +25,8 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
./build/bin/unit_tests"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
./build/bin/unit_tests'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -90,22 +35,10 @@ steps:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
@@ -114,29 +47,11 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
test/deploy/run-remote.sh '\
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- template: templates/stop.yaml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
@@ -47,7 +47,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: UnitTestWithNpKitA100
|
||||
@@ -69,7 +68,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: UnitTestH100
|
||||
@@ -89,7 +87,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestWithNpKitH100
|
||||
@@ -109,7 +106,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestNoIBEnv
|
||||
@@ -131,7 +127,6 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestMI300X
|
||||
@@ -151,6 +146,5 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
platform: rocm
|
||||
gpuArch: gfx942
|
||||
|
||||
@@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
|
||||
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
|
||||
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
@@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
|
||||
|
||||
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
|
||||
96
test/deploy/run-remote.sh
Executable file
96
test/deploy/run-remote.sh
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
|
||||
# Run a command on remote CI VMs via parallel-ssh.
|
||||
# By default, runs inside the mscclpp-test docker container.
|
||||
#
|
||||
# Usage:
|
||||
# run-remote.sh [OPTIONS] <command>
|
||||
#
|
||||
# Options:
|
||||
# --no-docker Run command directly on the host, not inside docker
|
||||
# --no-log Don't tail the log file in the background
|
||||
# --hostfile Override hostfile path (default: test/deploy/hostfile_ci)
|
||||
# --host Run command on a single host (uses parallel-ssh -H)
|
||||
# --user SSH user when using --host or custom hostfile
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
|
||||
|
||||
USE_DOCKER=true
|
||||
USE_LOG=true
|
||||
TARGET_HOST=""
|
||||
REMOTE_USER=""
|
||||
|
||||
while [[ "$1" == --* ]]; do
|
||||
case "$1" in
|
||||
--no-docker) USE_DOCKER=false; shift ;;
|
||||
--no-log) USE_LOG=false; shift ;;
|
||||
--hostfile)
|
||||
if [ -z "$2" ]; then
|
||||
echo "Missing value for --hostfile" >&2
|
||||
exit 1
|
||||
fi
|
||||
HOSTFILE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--host)
|
||||
if [ -z "$2" ]; then
|
||||
echo "Missing value for --host" >&2
|
||||
exit 1
|
||||
fi
|
||||
TARGET_HOST="$2"
|
||||
shift 2
|
||||
;;
|
||||
--user)
|
||||
if [ -z "$2" ]; then
|
||||
echo "Missing value for --user" >&2
|
||||
exit 1
|
||||
fi
|
||||
REMOTE_USER="$2"
|
||||
shift 2
|
||||
;;
|
||||
*) echo "Unknown option: $1" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 [--no-docker] [--no-log] <command>" >&2
|
||||
exit 1
|
||||
fi
|
||||
CMD="$*"
|
||||
|
||||
PSSH_TARGET_ARGS=()
|
||||
if [ -n "$TARGET_HOST" ]; then
|
||||
PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
|
||||
else
|
||||
PSSH_TARGET_ARGS=(-h "$HOSTFILE")
|
||||
fi
|
||||
|
||||
PSSH_USER_ARGS=()
|
||||
if [ -n "$REMOTE_USER" ]; then
|
||||
PSSH_USER_ARGS=(-l "$REMOTE_USER")
|
||||
fi
|
||||
|
||||
if $USE_LOG; then
|
||||
if [ -n "$TARGET_HOST" ]; then
|
||||
HOST="$TARGET_HOST"
|
||||
else
|
||||
HOST=$(head -1 "${HOSTFILE}")
|
||||
HOST="${HOST##*@}"
|
||||
fi
|
||||
: > "${HOST}"
|
||||
tail -f "${HOST}" &
|
||||
CHILD_PID=$!
|
||||
trap "kill $CHILD_PID 2>/dev/null" EXIT
|
||||
fi
|
||||
|
||||
if $USE_DOCKER; then
|
||||
parallel-ssh -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" -o . \
|
||||
-O "$SSH_OPTION" "sudo docker exec -t mscclpp-test bash -c \"set -ex; pushd /root/mscclpp >/dev/null; trap 'popd >/dev/null' EXIT; ${CMD}\""
|
||||
else
|
||||
parallel-ssh -i -t 0 "${PSSH_TARGET_ARGS[@]}" "${PSSH_USER_ARGS[@]}" -x "-i ${KeyFilePath}" \
|
||||
-O "$SSH_OPTION" "set -ex; ${CMD}"
|
||||
fi
|
||||
@@ -1,6 +1,5 @@
|
||||
set -e
|
||||
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
|
||||
function run_mscclpp_test()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user