Remove GTest dependency, add code coverage, and refactor unit tests and CI pipelines (#744)

- Removes the GTest dependency, replacing it with a minimal custom
framework (`test/framework.*`) that covers only what the tests actually
use — a unified `TEST()` macro with SFINAE-based fixture auto-detection,
`EXPECT_*`/`ASSERT_*` assertions, environments, and setup/teardown.
- Adds an `--exclude-perf-tests` flag and substring-based negative filtering
- Adds an `MSCCLPP_ENABLE_COVERAGE` CMake option with gcov/lcov; CI uploads to
Codecov
- Merges standalone `test/perf/` into main test targets
- Refactors Azure pipelines to reduce redundancy and improve readability

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
This commit is contained in:
Copilot
2026-03-24 23:34:38 -04:00
committed by GitHub
parent 5d18835417
commit 93f6eeaa6b
68 changed files with 2116 additions and 2416 deletions

View File

@@ -0,0 +1,93 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: CodeCoverageA100
timeoutInMinutes: 40
pool:
name: msccl-ci
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: CodeCoverageH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
- job: CodeCoverageMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
variables:
- group: mscclpp
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

View File

@@ -41,11 +41,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -37,33 +37,6 @@ jobs:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -77,107 +50,46 @@ jobs:
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
- template: templates/deploy.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- task: Bash@3
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- task: Bash@3
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- task: Bash@3
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/stop.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT

View File

@@ -40,11 +40,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -61,9 +60,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -40,9 +40,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/rccl-test.yaml
- template: templates/rccl-test.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
gpuArch: gfx942

View File

@@ -0,0 +1,110 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: TestsCoverageNonPerf
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp
export GCOV_PREFIX_STRIP=$STRIP_COUNT
echo "Running unit_tests..."
./build/bin/unit_tests
echo "unit_tests: PASSED"
echo "Running mp_unit_tests -np 2..."
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 2: PASSED"
echo "Running mp_unit_tests -np 4..."
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 4: PASSED"
- template: run-remote-task.yml
parameters:
name: CaptureCoverage
displayName: Capture coverage data with lcov
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
GCOV_TOOL_ARG=""
if [ "${{ parameters.platform }}" = "rocm" ]; then
apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
GCOV_WRAPPER=$(mktemp)
printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
chmod +x "$GCOV_WRAPPER"
GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
fi
lcov --version
LCOV_CAPTURE_ARGS=""
if lcov --help 2>&1 | grep -q "inconsistent"; then
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
fi
lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
if [ ! -s coverage.info ]; then
echo "ERROR: coverage.info was not generated."
exit 1
fi
lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
lcov --list coverage.info
ls -la coverage.info
- task: Bash@3
name: FetchCoverage
displayName: Fetch coverage data from remote VM
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
HOST=$(head -1 ${HOSTFILE})
ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UploadCodecov
displayName: Upload coverage to Codecov
inputs:
targetType: 'inline'
script: |
set -e
curl -Os https://cli.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,131 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
# Build parameters
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
default: ''
- name: buildType
type: string
default: 'Release'
- name: buildTests
type: string
default: 'true'
- name: cmakeArgs
type: string
default: ''
- name: buildName
type: string
default: 'Build'
- name: buildDisplayName
type: string
default: 'Build'
# Deploy parameters
- name: deployArgs
type: string
default: ''
steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
name: EnsureAzureCLI
displayName: Ensure Azure CLI Installed
inputs:
targetType: inline
script: |
set -e
if command -v az >/dev/null 2>&1; then
az version >/dev/null
exit 0
fi
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# 1. Build
- task: Bash@3
name: ${{ parameters.buildName }}
displayName: ${{ parameters.buildDisplayName }}
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
BUILD_TESTS_ARG=""
if [ "${{ parameters.buildTests }}" = "true" ]; then
BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
fi
GPU_ARCH_ARG=""
if [ -n "${{ parameters.gpuArch }}" ]; then
GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ "${{ parameters.platform }}" = "rocm" ]; then
eval CXX=/opt/rocm/bin/hipcc cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_ROCM=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
else
eval cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
echo "=== Build artifacts ==="
ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
du -sh build/bin/* 2>/dev/null || true
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 2. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
# 3. Deploy test environment
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: ${{ parameters.deployArgs }}
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -1,242 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: SendRecvTest
displayName: Run mscclpp SendRecv test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp AllReduce test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllToAll
displayName: Run mscclpp AllToAll test
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,282 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# sshKeySecureFile the secureFile name for your SSH key
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: nccltest-single-node
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNcclTests
displayName: Install NCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNccl
displayName: Install NCCL
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\\\" -f4); \
if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \
cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \
cd nccl; \
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL AllGather perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "allgather" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allreduce" (allgather stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclAllGatherFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL AllReduce perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "allreduce" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allgather" (allreduce stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclAllReduceFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Runs the NCCL Broadcast perf test twice through the MSCCLPP NCCL shim:
# once forcing fallback for "broadcast" (the tested op falls back to stock
# NCCL) and once forcing fallback for "allreduce" (broadcast stays on MSCCLPP).
# Fix: LD_LIBRARY_PATH is now escaped as \$LD_LIBRARY_PATH so it expands
# inside the container's shell, not on the remote host before `docker exec`
# (matching the escaping used by the other templates in this pipeline).
- task: Bash@3
  name: RunNcclBroadcastFallbaclkToNcclTest  # NOTE(review): "Fallbaclk" typo kept — renaming a step `name` could break references
  displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbaclkToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS when the job ends, pass or fail (condition:
# always()), so the GPU machines do not keep accruing cost after the run.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       VMSS name to start/stop
#   nvccGencode    NVCC -gencode flags used when building NCCL
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: nvccGencode    # NVCC -gencode flags used when building NCCL
  type: string
  default: "-gencode=arch=compute_80,code=sm_80"
steps:
# Build mscclpp, start the VMSS, and deploy the test environment.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    deployArgs: 'nccltest-single-node'
- template: run-remote-task.yml
  parameters:
    name: InstallNcclTests
    displayName: Install NCCL Tests
    remoteScript: |
      cd
      git clone https://github.com/NVIDIA/nccl-tests.git
      cd nccl-tests
      MPI=1 MPI_HOME=/usr/local/mpi make -j
# Build the latest tagged upstream NCCL release as the fallback library.
- template: run-remote-task.yml
  parameters:
    name: InstallNccl
    displayName: Install NCCL
    remoteScript: |
      LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
      if [ -z "$LATEST_TAG" ]; then
        echo "Failed to fetch latest NCCL tag"
        exit 1
      fi
      cd
      git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
      cd nccl
      make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
# Each fallback test runs twice: once forcing fallback for the tested
# operation and once forcing it for a different one (no fallback path hit).
- template: run-remote-task.yml
  parameters:
    name: RunNcclAllGatherFallbaclkToNcclTest
    displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunNcclAllReduceFallbaclkToNcclTest
    displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunNcclBroadcastFallbaclkToNcclTest
    displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,142 +0,0 @@
# .azure-pipelines/templates/rccl-test.yaml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# sshKeySecureFile the secureFile name for your SSH key
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
default: "gfx942"
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true rocm"
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: InstallRcclTests
displayName: Install RCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; \
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
cd rocm-systems; \
git sparse-checkout init --cone; \
git sparse-checkout set projects/rccl-tests; \
git checkout; \
cd projects/rccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllGatherTest
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllReduceTest
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,63 @@
# .azure-pipelines/templates/rccl-test.yml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: gpuArch        # ROCm GPU architecture passed to the build (e.g. gfx942)
  type: string
  default: "gfx942"
steps:
# Build for ROCm (tests disabled) and deploy onto the single-node VMSS.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: rocm
    gpuArch: ${{ parameters.gpuArch }}
    buildTests: false
    deployArgs: 'single-node-test true rocm'
# Sparse-checkout only the rccl-tests project from the rocm-systems monorepo.
- template: run-remote-task.yml
  parameters:
    name: InstallRcclTests
    displayName: Install RCCL Tests
    remoteScript: |
      cd
      git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
      cd rocm-systems
      git sparse-checkout init --cone
      git sparse-checkout set projects/rccl-tests
      git checkout
      cd projects/rccl-tests
      MPI=1 MPI_HOME=/usr/local/mpi make -j
# Each test runs twice: once preloading the MSCCLPP NCCL shim, once with stock RCCL.
- template: run-remote-task.yml
  parameters:
    name: RunRcclAllGatherTest
    displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
  parameters:
    name: RunRcclAllReduceTest
    displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
      mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,27 @@
# .azure-pipelines/templates/run-remote-task.yml
# ----------------------------------------------
# Wraps test/deploy/run-remote.sh: executes `remoteScript` on the deployed
# test hosts. The body is fed through a quoted heredoc ('REMOTE_CMD') so it
# is not expanded by the local shell before being shipped to the remote side.
parameters:
- name: name             # optional Azure step name; omitted when empty
  type: string
  default: ''
- name: displayName
  type: string
- name: runRemoteArgs    # extra flags for run-remote.sh (e.g. '--no-docker --no-log')
  type: string
  default: ''
- name: remoteScript     # script body to execute remotely
  type: string
- name: workingDirectory
  type: string
  default: '$(System.DefaultWorkingDirectory)'
steps:
- task: Bash@3
  # Only attach a step `name` when the caller supplied one; an empty name
  # is not a valid step identifier.
  ${{ if ne(parameters.name, '') }}:
    name: ${{ parameters.name }}
  displayName: ${{ parameters.displayName }}
  inputs:
    targetType: 'inline'
    script: |
      test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
      ${{ parameters.remoteScript }}
      REMOTE_CMD
    workingDirectory: ${{ parameters.workingDirectory }}

View File

@@ -0,0 +1,20 @@
# .azure-pipelines/templates/stop.yml
# -----------------------------------
# Deallocates the CI VMSS. Runs unconditionally (condition: always()) so
# the GPU machines are released even when earlier steps in the job failed.
parameters:
- name: subscription   # Azure subscription used for the CLI call
  type: string
- name: vmssName       # VMSS to deallocate
  type: string
- name: resourceGroup
  type: string
  default: mscclpp
steps:
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

View File

@@ -1,191 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: StopContainer
displayName: Stop existing container
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: BuildWithIb
displayName: Rebuild with IB
inputs:
targetType: 'inline'
script: |
rm -rf build && mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: DeployTestEnvWithIb
displayName: Deploy Test Env (with IB build)
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: PyTestsWithIbBuildDisableIb
displayName: Run pytests (IB build, IB tests disabled)
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,95 @@
# Runs the mscclpp unit tests, multi-process unit tests, and python tests on
# one pool/container: first from a build with IB disabled, then rebuilds with
# IB enabled and re-runs the python tests (IB tests themselves still disabled).
parameters:
- name: subscription   # Azure subscription used to start/stop the VMSS
  type: string
- name: vmssName       # VMSS that hosts the test machines
  type: string
- name: gpuArch        # GPU architecture for the CMake build
  type: string
steps:
# Build without IB support and deploy onto the single-node VMSS.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
    deployArgs: 'single-node-test false'
- template: run-remote-task.yml
  parameters:
    name: UnitTests
    displayName: Run mscclpp unit tests
    remoteScript: |
      ./build/bin/unit_tests
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Tear down the container before redeploying with the IB build.
- template: run-remote-task.yml
  parameters:
    name: StopContainer
    displayName: Stop existing container
    runRemoteArgs: '--no-docker --no-log'
    remoteScript: |
      sudo docker stop mscclpp-test || true
      sudo docker rm mscclpp-test || true
- task: Bash@3
  displayName: Remove generated SSH key files
  inputs:
    targetType: 'inline'
    script: |
      rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Second build: same configuration but with IB support left on (default).
- task: Bash@3
  name: BuildWithIb
  displayName: Rebuild with IB
  inputs:
    targetType: 'inline'
    script: |
      set -e
      rm -rf build
      mkdir -p build && cd build
      cmake \
        -DCMAKE_BUILD_TYPE=Release \
        -DMSCCLPP_BYPASS_GPU_CHECK=ON \
        -DMSCCLPP_USE_CUDA=ON \
        -DMSCCLPP_BUILD_TESTS=ON \
        -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: DeployTestEnvWithIb
  displayName: Deploy Test Env (with IB build)
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: single-node-test false
    workingDirectory: $(System.DefaultWorkingDirectory)
- template: run-remote-task.yml
  parameters:
    name: PyTestsWithIbBuildDisableIb
    displayName: Run pytests (IB build, IB tests disabled)
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Deallocate the VMSS even if earlier steps failed.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,145 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# Start the VM scale set that hosts the GPU test nodes.
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
# Provision the test environment (containers, hostfile) on the started nodes.
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test"
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Build MSCCL++ (with NPKit event flags) inside the mscclpp-test container on
# each host listed in the CI hostfile, via parallel-ssh.
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      # parallel-ssh -o . writes per-host output to a file named after the
      # host ("azureuser@10.0.0.4"); truncate it and stream it live via tail.
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        set -e; \
        cd /root/mscclpp; \
        mkdir -p build && cd build; \
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
        make -j"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Run the multi-process unit tests under NPKit, then verify the generated
# trace contains the expected executor events.
- task: Bash@3
  name: MpUnitTests
  displayName: Run mscclpp multi-process unit tests
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Run executor pytests under NPKit (non-packet then packet variants) and
# verify the corresponding NPKit events appear in the trace.
- task: Bash@3
  name: PyTests
  displayName: Run pytests
  inputs:
    targetType: 'inline'
    script: |
      # set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      # NOTE(review): the single quotes around the -k filters below close and
      # reopen the outer single-quoted remote command, so the filter reaches
      # pytest unquoted on the remote shell — confirm this is intended.
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
        rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
        python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
        grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
        grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS, even if earlier steps failed, to avoid cost.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,57 @@
# Azure Pipelines step template: build MSCCL++ with NPKit event flags, run the
# NPKit-instrumented unit/py tests on a VMSS node, and verify the generated
# event trace. Composes the shared deploy / run-remote-task / stop templates.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: gpuArch
  type: string

steps:
# Start the VMSS, deploy the test environment, and build with NPKit enabled.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
    deployArgs: 'single-node-test'
# Run the executor multi-process unit test under NPKit and check that the
# expected executor events appear in the generated trace.
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
# Run the executor pytests (non-packet then packet variants) under NPKit and
# check for the corresponding events in each trace.
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
      rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
      python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
      grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
      grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
# Deallocate the VMSS (runs even on failure inside stop.yml).
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,142 +0,0 @@
# Azure Pipelines step template: build MSCCL++ on the agent, then run the
# unit tests, multi-process unit tests, and pytests inside the mscclpp-test
# container on a VMSS node reached via parallel-ssh.
#
# Fix: the PyTests remote command previously ran
#   export PATH=/usr/local/mpi/bin:$PATH \
#   export LD_LIBRARY_PATH=...; 
# i.e. the second `export` became an argument of the first (missing `;`
# before the line continuation). A `;` terminator is now added so each
# export is its own command, matching the MpUnitTests step.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: sshKeySecureFile
  type: string
# GPU platform: 'cuda' (default) or 'rocm'; selects the compiler/CMake flags.
- name: platform
  type: string
  default: 'cuda'
- name: gpuArch
  type: string

steps:
# Build on the agent (GPU check bypassed; the build host has no GPU).
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      mkdir build && cd build
      if [ "${{ parameters.platform }}" == "rocm" ]; then
        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      else
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      fi
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Fetch the SSH private key used by parallel-ssh to reach the test node.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test true ${{ parameters.platform }}"
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Each remote step below truncates the per-host parallel-ssh output file
# ("azureuser@10.0.0.4", created by -o .) and streams it live via tail.
- task: Bash@3
  name: UnitTests
  displayName: Run mscclpp unit tests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd /root/mscclpp; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        ./build/bin/unit_tests"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: MpUnitTests
  displayName: Run mscclpp multi-process unit tests
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        cd /root/mscclpp; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
  name: PyTests
  displayName: Run pytests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
# Always deallocate the VMSS, even on failure, to avoid cost.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,48 @@
# Azure Pipelines step template: deploy a test node, run the unit tests,
# multi-process unit tests, and pytests, then stop the node. Composes the
# shared deploy / run-remote-task / stop templates.
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
# GPU platform: 'cuda' (default) or 'rocm'; forwarded to deploy.yml.
- name: platform
  type: string
  default: 'cuda'
- name: gpuArch
  type: string

steps:
# Start the VMSS, deploy the test environment, and build MSCCL++.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: ${{ parameters.platform }}
    gpuArch: ${{ parameters.gpuArch }}
    deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
  parameters:
    name: UnitTests
    displayName: Run mscclpp unit tests
    remoteScript: |
      ./build/bin/unit_tests
# Run the MPI-based tests at 2, 4, and 8 ranks.
- template: run-remote-task.yml
  parameters:
    name: MpUnitTests
    displayName: Run mscclpp multi-process unit tests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
      mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
  parameters:
    name: PyTests
    displayName: Run pytests
    remoteScript: |
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
# Deallocate the VMSS (runs even on failure inside stop.yml).
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}

View File

@@ -1,50 +0,0 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
platform: rocm
gpuArch: gfx942

View File

@@ -37,17 +37,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -59,17 +58,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -79,17 +77,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -99,17 +96,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
@@ -121,15 +117,34 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-no-ib-env.yaml
- template: templates/ut-no-ib-env.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

24
.codecov.yml Normal file
View File

@@ -0,0 +1,24 @@
# Codecov configuration for MSCCL++.
codecov:
  # Only post coverage status when CI passed.
  require_ci_to_pass: yes
coverage:
  status:
    # Overall project coverage target, with 1% allowed drop per commit.
    project:
      default:
        target: 68%
        threshold: 1%
    # Coverage target for lines changed in a PR.
    patch:
      default:
        target: 80%
flag_management:
  default_rules:
    # Carry forward flag coverage from the last relevant commit when a flag
    # is not uploaded for the current one.
    carryforward: true
# Paths excluded from coverage reporting.
ignore:
  - "test/"
  - "examples/"
  - "python/"
  - "tools/"
  - "docs/"
  - "docker/"

View File

@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- name: Checkout repository
@@ -62,7 +62,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
@@ -107,7 +107,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis

View File

@@ -1,69 +0,0 @@
# GitHub Actions workflow: manually-dispatched integration tests for the
# MSCCL++ collective benchmarks on a self-hosted 8xA100 runner, with results
# checked against a stored baseline.
#
# Fix: the `-k 5` allreduce invocation had a stray token `2` after
# `-i 3145728` (`-i 3145728 2 -k 5`), inconsistent with every other
# invocation in this workflow; the stray token is removed.
name: IntegrationTest

on: workflow_dispatch

jobs:
  IntegrationTest:
    runs-on: [ self-hosted, A100 ]
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        cuda: [ cuda11.8, cuda12.2 ]
    container:
      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build
        run: |
          mkdir build && cd build
          cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j

      # Pin each GPU to its max memory/SM clocks for stable perf numbers.
      - name: Lock GPU clock frequency
        run: |
          sudo nvidia-smi -pm 1
          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
          done

      # -b/-e: min/max message size; -f: size multiplier; -i: size increment;
      # -k: kernel variant; -o: JSONL output appended for the baseline check.
      - name: Run mscclpp AllGather test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl

      - name: Run mscclpp SendRecv test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl

      - name: Run mscclpp AllReduce test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl

      - name: Run mscclpp AllToAll test
        run: |
          set -e
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl

      # Compare the collected JSONL results against the stored ndmv4 baseline.
      - name: Check collective primitives performance
        run: |
          set -e
          python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- uses: actions/checkout@v4

View File

@@ -1,52 +0,0 @@
name: UnitTest
on: workflow_dispatch
jobs:
UnitTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UnitTests
run: |
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
.vscode/
build/
build_coverage/
__pycache__
.*.swp
*.so

View File

@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
@@ -99,6 +100,62 @@ else()
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
endif()
endif()
# Code coverage setup: instruments C++ (not CUDA) targets with gcov and
# provides `coverage` / `coverage-clean` convenience targets built on lcov.
if(MSCCLPP_ENABLE_COVERAGE)
  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
    message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
  endif()
  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
    message(STATUS "Code coverage enabled")
    # Add coverage flags to C++ targets only (not CUDA)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
    add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
    # Find lcov
    find_program(LCOV_PATH lcov)
    if(NOT LCOV_PATH)
      message(WARNING "lcov not found. Install lcov to generate coverage reports.")
    endif()
    if(LCOV_PATH)
      # Add coverage target: zero counters, run ctest, then capture and
      # filter the results into ${CMAKE_BINARY_DIR}/coverage.info.
      add_custom_target(coverage
        COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
        COMMAND ${LCOV_PATH} --directory . --zerocounters
        COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
        COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
        COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
        COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
        COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
        # Drop system headers, test code, and generated build files.
        COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
        COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Generating code coverage report"
      )
      # Add coverage clean target: delete the report and reset gcov counters.
      add_custom_target(coverage-clean
        COMMAND ${CMAKE_COMMAND} -E remove coverage.info
        COMMAND ${LCOV_PATH} --directory . --zerocounters
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Cleaning coverage data"
      )
    endif()
  else()
    message(WARNING "Code coverage is only supported with GCC or Clang compilers")
  endif()
endif()
if(MSCCLPP_GPU_ARCHS)
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")

View File

@@ -3,15 +3,16 @@
[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
| Testing Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install lcov 2.2
# Built from the upstream release tarball (the distro package is older).
# Installs the Perl modules lcov 2.x requires via cpanm, builds/installs
# lcov, then removes the build tree and apt caches to keep the layer small.
RUN LCOV_VERSION="2.2" && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        cpanminus \
        gcc \
        make \
        perl \
        && \
    cpanm --notest \
        Capture::Tiny \
        DateTime \
        JSON::XS \
        Memory::Process \
        TimeDate \
        && \
    cd /tmp && \
    curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
    tar xzf lcov.tar.gz && \
    cd lcov-${LCOV_VERSION} && \
    make install && \
    cd / && rm -rf /tmp/lcov* && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
@@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
pip install --no-cache-dir coverage xlsxwriter
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -4,27 +4,27 @@ set -e
declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["cuda11.8"]="/usr/local/cuda-11.8/compat"
["cuda12.4"]="/usr/local/cuda-12.4/compat"
["cuda12.8"]="/usr/local/cuda-12.8/compat"
["cuda12.9"]="/usr/local/cuda-12.9/compat"
["cuda13.0"]="/usr/local/cuda-13.0/compat"
["rocm6.2"]="/opt/rocm/lib"
)
declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
@@ -36,7 +36,7 @@ TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then

View File

@@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
```bash
# For NVIDIA platforms
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
# For AMD platforms
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
```
@@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2: ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

View File

@@ -1,13 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
find_package(MPI)
find_package(MPI REQUIRED)
set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
if(MSCCLPP_USE_IB)
list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include)
@@ -17,6 +16,7 @@ if(MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
add_compile_options(--offload-arch=${arch})
endforeach()
add_compile_definitions(__HIP_PLATFORM_AMD__)
endif()
function(add_test_executable name sources)
@@ -38,28 +38,25 @@ add_test_executable(executor_test executor_test.cc)
configure_file(run_mpi_test.sh.in run_mpi_test.sh)
include(CTest)
include(FetchContent)
FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip)
option(INSTALL_GTEST OFF)
FetchContent_MakeAvailable(googletest)
include(GoogleTest)
# Build test framework library
add_library(test_framework STATIC framework.cc)
target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON})
target_link_libraries(test_framework PUBLIC MPI::MPI_CXX)
# Unit tests
add_executable(unit_tests)
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST})
target_link_libraries(unit_tests ${TEST_LIBS_COMMON} test_framework)
target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(unit)
gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST)
add_test(NAME unit_tests COMMAND unit_tests)
# Multi-process unit tests
add_executable(mp_unit_tests)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX)
target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} test_framework MPI::MPI_CXX)
target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
add_subdirectory(mp_unit)
gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST)
add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh mp_unit_tests 2)
# mscclpp-test
add_subdirectory(mscclpp-test)
# Performance tests
add_subdirectory(perf)

View File

@@ -1,4 +1,4 @@
set -e
set -ex
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"

107
test/deploy/run-remote.sh Executable file
View File

@@ -0,0 +1,107 @@
#!/bin/bash
# Run a command on remote CI VMs via parallel-ssh.
# By default, runs inside the mscclpp-test docker container.
#
# Usage:
#   run-remote.sh [OPTIONS] < <command_script>
#
# Options:
#   --no-docker    Run command directly on the host, not inside docker
#   --no-log       Don't tail the log file in the background
#   --hostfile     Override hostfile path (default: test/deploy/hostfile_ci)
#   --host         Run command on a single host (uses parallel-ssh -H)
#   --user         SSH user when using --host or custom hostfile
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
SSH_OPTION="StrictHostKeyChecking=no"
# SSH private key path, injected by the CI pipeline (secure-file download).
KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
USE_DOCKER=true
# NOTE(review): USE_LOG is parsed below but never referenced in this script —
# confirm whether the log-tailing behavior mentioned in the header was removed.
USE_LOG=true
TARGET_HOST=""
REMOTE_USER=""
# Print usage to stderr.
usage() {
  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
}
# Exit with an error if an option that requires a value was given without one.
require_value() {
  local opt="$1"
  local val="$2"
  if [ -z "$val" ]; then
    echo "Missing value for ${opt}" >&2
    exit 1
  fi
}
# Parse leading --options; anything else is rejected below.
while [[ "$1" == --* ]]; do
  case "$1" in
    --no-docker) USE_DOCKER=false; shift ;;
    --no-log) USE_LOG=false; shift ;;
    --hostfile)
      require_value "--hostfile" "${2-}"
      HOSTFILE="$2"
      shift 2
      ;;
    --host)
      require_value "--host" "${2-}"
      TARGET_HOST="$2"
      shift 2
      ;;
    --user)
      require_value "--user" "${2-}"
      REMOTE_USER="$2"
      shift 2
      ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done
# No positional args allowed, and stdin must be a pipe/file (the command script).
if [ $# -ne 0 ] || [ -t 0 ]; then
  usage
  exit 1
fi
CMD=$(cat)
if [ -z "$CMD" ]; then
  usage
  exit 1
fi
# Base64-encode the command so it survives the SSH (and optional docker-exec)
# quoting layers untouched; decoded with `base64 -d` on the remote side.
CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n')
# Target selection: single host (-H) or hostfile (-h).
PSSH_TARGET_ARGS=()
if [ -n "$TARGET_HOST" ]; then
  PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
else
  PSSH_TARGET_ARGS=(-h "$HOSTFILE")
fi
PSSH_USER_ARGS=()
if [ -n "$REMOTE_USER" ]; then
  PSSH_USER_ARGS=(-l "$REMOTE_USER")
fi
# Shared parallel-ssh arguments: no timeout (-t 0), targets, optional user,
# extra ssh option (-x) selecting the key file, and no host-key checking.
PSSH_COMMON=(
  -t 0
  "${PSSH_TARGET_ARGS[@]}"
  "${PSSH_USER_ARGS[@]}"
  -x "-i ${KeyFilePath}"
  -O "$SSH_OPTION"
)
if $USE_DOCKER; then
  # Build the in-container command. The \\\$ / \\\" escapes must survive two
  # rounds of interpretation: the local double-quoted string below, then the
  # remote shell that runs `sudo docker exec ... bash -c "..."`.
  INNER="set -euxo pipefail;"
  INNER+=" cd /root/mscclpp;"
  INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
  INNER+=" CMD_B64='${CMD_B64}';"
  INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
  parallel-ssh -i "${PSSH_COMMON[@]}" \
    "sudo docker exec mscclpp-test bash -c \"${INNER}\""
else
  # Decode and execute the command directly on the remote host.
  parallel-ssh -i "${PSSH_COMMON[@]}" \
    "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
fi

View File

@@ -1,6 +1,5 @@
set -e
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
export PATH=/usr/local/mpi/bin:$PATH
function run_mscclpp_test()
{

View File

@@ -93,11 +93,8 @@ double benchTime(int rank, std::shared_ptr<mscclpp::Bootstrap> bootstrap, std::s
int main(int argc, char* argv[]) {
if (argc != 5 && argc != 6) {
std::cerr << "Usage: " << argv[0] << " <buffer size>"
<< " <execution plan path>"
<< " <number of iterations>"
<< " <number of graph iterations>"
<< " (optional) <packet type>" << std::endl;
std::cerr << "Usage: " << argv[0] << " <buffer size> <execution plan path>"
<< " <number of iterations> <number of graph iterations> (optional) <packet type>" << std::endl;
return 1;
}
@@ -142,7 +139,8 @@ int main(int argc, char* argv[]) {
NpKit::Shutdown();
}
std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << deltaSec * 1.e6 << " us" << std::endl;
double latencyUs = deltaSec * 1.e6;
std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl;
MPI_Finalize();
return 0;
}

323
test/framework.cc Normal file
View File

@@ -0,0 +1,323 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "framework.hpp"
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
namespace mscclpp {
namespace test {
// Global state shared by the runner (runAllTests) and the assertion helpers
// (utils::reportFailure / reportSuccess). File-local; mutated only from this
// translation unit.
static int gMpiRank = 0;                        // rank of this process in MPI_COMM_WORLD
static int gMpiSize = 1;                        // size of MPI_COMM_WORLD
static bool gMpiInitialized = false;            // set once by utils::initializeMPI()
static bool gCurrentTestPassed = true;          // cleared by reportFailure() for the running test
static std::string gCurrentTestFailureMessage;  // accumulated "file:line: msg" failure text
static std::string gCurrentTestName;            // "Suite.Name" of the running test, "" if none
// Returns "Suite.Name" for the currently running test, or "" if none.
std::string currentTestName() { return gCurrentTestName; }
namespace utils {
// Initialize MPI (unless something else already did) and cache this process's
// rank and world size in the file-local globals. Idempotent: a second call
// returns immediately.
void initializeMPI(int argc, char* argv[]) {
  if (gMpiInitialized) return;
  int initialized = 0;
  MPI_Initialized(&initialized);
  if (!initialized) {
    MPI_Init(&argc, &argv);
  }
  MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank);
  MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize);
  gMpiInitialized = true;
}
// Finalize MPI if initializeMPI() ran. NOTE(review): this calls MPI_Finalize()
// even when MPI was initialized externally (gMpiInitialized is set in that
// case too) — confirm callers expect this ownership.
static void finalizeMPI() {
  if (!gMpiInitialized) return;
  MPI_Finalize();
  gMpiInitialized = false;
}
// True on rank 0 (the rank that prints runner output).
bool isMainRank() { return gMpiRank == 0; }
int getMPIRank() { return gMpiRank; }
int getMPISize() { return gMpiSize; }
void cleanupMPI() { finalizeMPI(); }
// Record an assertion failure for the currently running test: marks it failed,
// appends "file:line: message" to the accumulated failure text, and echoes the
// message to stderr immediately.
void reportFailure(const char* file, int line, const std::string& message) {
  gCurrentTestPassed = false;
  std::ostringstream oss;
  oss << file << ":" << line << ": " << message;
  if (!gCurrentTestFailureMessage.empty()) {
    gCurrentTestFailureMessage += "\n";
  }
  gCurrentTestFailureMessage += oss.str();
  std::cerr << oss.str() << std::endl;
}
// Reset the per-test pass/fail state back to "passed".
void reportSuccess() {
  gCurrentTestPassed = true;
  gCurrentTestFailureMessage.clear();
}
// Timer implementation.
// A simple stopwatch: start() records a timestamp, stop() records the end
// timestamp. While running, elapsed time is measured against the current
// clock; after stop(), against the recorded end point.
Timer::Timer() : isRunning_(false) {}
void Timer::start() {
  startTime_ = std::chrono::high_resolution_clock::now();
  isRunning_ = true;
}
void Timer::stop() {
  endTime_ = std::chrono::high_resolution_clock::now();
  isRunning_ = false;
}
double Timer::elapsedMicroseconds() const {
  // Pick the end point: "now" while running, the stopped timestamp otherwise.
  const auto endPoint = isRunning_ ? std::chrono::high_resolution_clock::now() : endTime_;
  // duration_cast truncates to whole microseconds before the double conversion.
  return static_cast<double>(std::chrono::duration_cast<std::chrono::microseconds>(endPoint - startTime_).count());
}
double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; }
double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; }
// Throw std::runtime_error when a GPU runtime call failed. The message embeds
// the call site ("file:line") and the runtime's error string.
void cudaCheck(cudaError_t err, const char* file, int line) {
  if (err == cudaSuccess) return;
  std::ostringstream oss;
  oss << "CUDA error at " << file << ":" << line << " - " << cudaGetErrorString(err);
  throw std::runtime_error(oss.str());
}
}  // namespace utils
// TestRegistry implementation
// Meyers singleton: constructed on first use, shared by every TEST()/
// PERF_TEST() registration and by RUN_ALL_TESTS().
TestRegistry& TestRegistry::instance() {
  static TestRegistry registry;
  return registry;
}
// Append a test to the run list. Called during static initialization by the
// registration macros; isPerfTest marks tests skippable via
// --exclude-perf-tests.
void TestRegistry::registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
                                bool isPerfTest) {
  tests_.push_back({suiteName, testName, std::move(factory), isPerfTest});
}
// Register a global environment. The registry takes ownership: the pointer is
// deleted after its TearDown() at the end of runAllTests().
void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(env); }
// Returns true if the test should run given the filter string.
// Filter syntax:
//   ""          -> run all
//   "Pattern"   -> run only tests whose full name contains Pattern
//   "-Pattern"  -> run all tests EXCEPT those whose full name contains Pattern
//                  (multiple comma-separated patterns allowed after '-')
static bool matchesFilter(const std::string& fullName, const std::string& filter) {
  if (filter.empty()) return true;
  if (filter[0] != '-') {
    // Positive filter: include only tests whose name contains the pattern.
    return fullName.find(filter) != std::string::npos;
  }
  // Negative filter: exclude tests matching any comma-separated pattern.
  std::istringstream patternStream(filter.substr(1));
  std::string pattern;
  while (std::getline(patternStream, pattern, ',')) {
    if (!pattern.empty() && fullName.find(pattern) != std::string::npos) {
      return false;
    }
  }
  return true;
}
// Run every registered test (subject to --filter / --exclude-perf-tests),
// printing gtest-style progress on rank 0 only.
//
// Flow: initialize MPI if needed; parse arguments; SetUp() all registered
// Environments; for each matching test run SetUp -> TestBody -> TearDown
// (TearDown runs whenever SetUp succeeded, even if TestBody threw);
// synchronize the per-test verdict across ranks with MPI_Allreduce(MIN);
// print a summary; TearDown() and delete the Environments in reverse order.
//
// Returns 1 on every rank if any test failed, 0 otherwise.
int TestRegistry::runAllTests(int argc, char* argv[]) {
  // Initialize MPI if not already initialized
  if (!gMpiInitialized) {
    utils::initializeMPI(argc, argv);
  }
  // Parse command line arguments: --filter=<pat> / --filter <pat> and
  // --exclude-perf-tests (skips tests registered via PERF_TEST()).
  std::string filter;
  bool excludePerfTests = false;
  for (int i = 1; i < argc; ++i) {
    std::string arg = argv[i];
    if (arg.find("--filter=") == 0) {
      filter = arg.substr(9);  // Length of "--filter="
    } else if (arg == "--filter" && i + 1 < argc) {
      filter = argv[i + 1];
      ++i;
    } else if (arg == "--exclude-perf-tests") {
      excludePerfTests = true;
    }
  }
  // Set up global test environments; any failure aborts the whole run.
  for (auto* env : environments_) {
    try {
      env->SetUp();
    } catch (const std::exception& e) {
      if (gMpiRank == 0) {
        std::cerr << "Failed to set up test environment: " << e.what() << std::endl;
      }
      return 1;
    }
  }
  int passed = 0;
  int failed = 0;
  int skipped = 0;
  // Pre-count the tests to run so the header can report totals up front.
  int totalToRun = 0;
  int skippedByFilter = 0;
  for (const auto& entry : tests_) {
    std::string fullName = entry.suiteName + "." + entry.testName;
    if (excludePerfTests && entry.isPerfTest) {
      skippedByFilter++;
      continue;
    }
    if (!matchesFilter(fullName, filter)) {
      skippedByFilter++;
      continue;
    }
    totalToRun++;
  }
  if (gMpiRank == 0) {
    std::cout << "[==========] Running " << totalToRun << " tests";
    if (skippedByFilter > 0) {
      std::cout << " (" << skippedByFilter << " skipped by filter)";
    }
    std::cout << ".\n";
  }
  for (const auto& entry : tests_) {
    std::string fullName = entry.suiteName + "." + entry.testName;
    if (excludePerfTests && entry.isPerfTest) continue;
    if (!matchesFilter(fullName, filter)) continue;
    // Reset per-test state for the assertion helpers.
    gCurrentTestPassed = true;
    gCurrentTestFailureMessage.clear();
    gCurrentTestName = fullName;
    if (gMpiRank == 0) {
      std::cout << "[ RUN      ] " << fullName << std::endl;
    }
    TestCase* testCase = nullptr;
    bool testSkipped = false;
    bool setUpSucceeded = false;
    try {
      testCase = entry.factory();
      testCase->SetUp();
      setUpSucceeded = true;
      testCase->TestBody();
    } catch (const SkipException& e) {
      // SKIP_TEST() was invoked (typically from SetUp()). NOTE(review): skips
      // are assumed to be collective — a rank-dependent skip bypasses the
      // MPI_Allreduce below and would leave the other ranks blocked in it.
      gCurrentTestPassed = true;
      testSkipped = true;
      if (gMpiRank == 0) {
        std::cout << "[  SKIPPED ] " << fullName << ": " << e.what() << std::endl;
      }
    } catch (const std::exception& e) {
      gCurrentTestPassed = false;
      if (gCurrentTestFailureMessage.empty()) {
        gCurrentTestFailureMessage = e.what();
      }
    } catch (...) {
      gCurrentTestPassed = false;
      if (gCurrentTestFailureMessage.empty()) {
        gCurrentTestFailureMessage = "Unknown exception";
      }
    }
    // Always call TearDown() if SetUp() succeeded, even if TestBody() threw
    if (setUpSucceeded && testCase != nullptr) {
      try {
        testCase->TearDown();
      } catch (const std::exception& e) {
        // If test already failed, keep original failure message
        if (gCurrentTestPassed) {
          gCurrentTestPassed = false;
          gCurrentTestFailureMessage = std::string("TearDown() failed: ") + e.what();
        }
      } catch (...) {
        if (gCurrentTestPassed) {
          gCurrentTestPassed = false;
          gCurrentTestFailureMessage = "TearDown() failed with unknown exception";
        }
      }
    }
    delete testCase;
    gCurrentTestName.clear();
    if (testSkipped) {
      skipped++;
      continue;
    }
    // Synchronize test status across all MPI processes: the test passes only
    // if it passed on every rank (MIN over {0,1}).
    int localPassed = gCurrentTestPassed ? 1 : 0;
    int globalPassed = 1;
    if (gMpiInitialized) {
      MPI_Allreduce(&localPassed, &globalPassed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
    } else {
      globalPassed = localPassed;
    }
    // Count on every rank, not just rank 0. Previously the counters were only
    // updated inside the rank-0 printing branch, so non-root ranks always
    // computed failed == 0 and returned exit code 0 even when tests failed.
    if (globalPassed) {
      passed++;
    } else {
      failed++;
    }
    if (gMpiRank == 0) {
      if (globalPassed) {
        std::cout << "[       OK ] " << fullName << std::endl;
      } else {
        std::cout << "[  FAILED  ] " << fullName << std::endl;
      }
    }
  }
  if (gMpiRank == 0) {
    std::cout << "[==========] " << totalToRun << " tests ran.\n";
    if (passed > 0) {
      std::cout << "[  PASSED  ] " << passed << " tests.\n";
    }
    if (skipped > 0) {
      std::cout << "[  SKIPPED ] " << skipped << " tests.\n";
    }
    if (failed > 0) {
      std::cout << "[  FAILED  ] " << failed << " tests.\n";
    }
  }
  // Tear down global test environments (in reverse order); the registry owns
  // and deletes them here.
  for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) {
    try {
      (*it)->TearDown();
      delete *it;
    } catch (const std::exception& e) {
      if (gMpiRank == 0) {
        std::cerr << "Failed to tear down test environment: " << e.what() << std::endl;
      }
    }
  }
  environments_.clear();
  // The counters are Allreduce-synchronized, so this exit code now agrees
  // across all ranks.
  return failed > 0 ? 1 : 0;
}
} // namespace test
} // namespace mscclpp

405
test/framework.hpp Normal file
View File

@@ -0,0 +1,405 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_TEST_FRAMEWORK_HPP_
#define MSCCLPP_TEST_FRAMEWORK_HPP_
#include <mpi.h>
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <mscclpp/gpu.hpp>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
namespace mscclpp {
namespace test {
// Test case base class.
// Each TEST()/PERF_TEST() expands to a subclass overriding TestBody(). The
// runner calls SetUp(), TestBody(), then TearDown(); TearDown() runs even when
// TestBody() throws, as long as SetUp() succeeded.
class TestCase {
 public:
  virtual ~TestCase() = default;
  virtual void SetUp() {}
  virtual void TearDown() {}
  virtual void TestBody() = 0;
};
// Environment base class (for global test setup/teardown).
// Registered via TestRegistry::addEnvironment(); SetUp() runs once before any
// test and TearDown() once after all tests.
class Environment {
 public:
  virtual ~Environment() = default;
  virtual void SetUp() {}
  virtual void TearDown() {}
};
// Test registry and runner.
// Singleton holding the list of registered tests and global environments;
// RUN_ALL_TESTS() forwards to runAllTests().
class TestRegistry {
 public:
  // Builds a fresh TestCase instance per run; the runner deletes it.
  using TestFactory = std::function<TestCase*()>;
  static TestRegistry& instance();
  // Register a test; isPerfTest marks it skippable via --exclude-perf-tests.
  void registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory,
                    bool isPerfTest = false);
  // Register a global environment; the registry takes ownership.
  void addEnvironment(Environment* env);
  // Run all registered tests; returns 1 if any failed, 0 otherwise.
  int runAllTests(int argc, char* argv[]);
 private:
  TestRegistry() = default;
  // One registered test: "suiteName.testName" is the full display name.
  struct TestEntry {
    std::string suiteName;
    std::string testName;
    TestFactory factory;
    bool isPerfTest;
  };
  std::vector<TestEntry> tests_;
  std::vector<Environment*> environments_;  // owned; deleted after TearDown()
};
// Returns "Suite.Name" for the currently running test, or "" if none.
std::string currentTestName();
// Utility functions
namespace utils {
// MPI management
// Initialize MPI (no-op if already done) and cache rank/world size.
void initializeMPI(int argc, char* argv[]);
// Finalize MPI if initializeMPI() set it up.
void cleanupMPI();
// True on rank 0.
bool isMainRank();
int getMPIRank();
int getMPISize();
// Timing utilities
// Simple stopwatch; elapsed* may be queried while running (measured against
// the current clock) or after stop() (measured against the stop timestamp).
class Timer {
 public:
  Timer();
  void start();
  void stop();
  double elapsedMicroseconds() const;
  double elapsedMilliseconds() const;
  double elapsedSeconds() const;
 private:
  std::chrono::high_resolution_clock::time_point startTime_;
  std::chrono::high_resolution_clock::time_point endTime_;
  bool isRunning_;
};
// CUDA utilities
// Throws std::runtime_error with "file:line" context when err != cudaSuccess.
void cudaCheck(cudaError_t err, const char* file, int line);
// Wrap a GPU runtime call; throws on failure with the call-site location.
#define CUDA_CHECK(call) mscclpp::test::utils::cudaCheck(call, __FILE__, __LINE__)
// Test assertion helpers
// Mark the current test failed and record/echo "file:line: message".
void reportFailure(const char* file, int line, const std::string& message);
// Reset the current test's status to passed.
void reportSuccess();
}  // namespace utils
// Exception for test skips.
// Thrown by SKIP_TEST(); the runner catches it and reports the test as
// skipped rather than failed.
class SkipException : public std::runtime_error {
 public:
  explicit SkipException(const std::string& message) : std::runtime_error(message) {}
};
// Helper class for FAIL() macro — supports message streaming via operator<<
// The failure is reported, and a std::runtime_error thrown to abort the test,
// from the destructor — i.e. at the end of the full `FAIL() << ...` statement.
// NOTE: the destructor throws unconditionally; do not use FAIL() where another
// exception may already be propagating (that would call std::terminate).
class FailHelper {
 public:
  explicit FailHelper(const char* file, int line) : file_(file), line_(line) {}
  // Append a value to the failure message.
  template <typename T>
  FailHelper& operator<<(const T& value) {
    message_ << value;
    return *this;
  }
  // noexcept(false) is required: destructors are implicitly noexcept in C++11+.
  ~FailHelper() noexcept(false) {
    std::string msg = message_.str();
    if (!msg.empty()) {
      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg);
    } else {
      ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed");
    }
    throw std::runtime_error("Test failed");
  }
 private:
  const char* file_;    // call site passed by FAIL()
  int line_;
  std::ostringstream message_;
};
// Helper class for SKIP_TEST() macro — supports message streaming via operator<<
// Usage: SKIP_TEST() << "Reason for skipping";
// The SkipException is thrown from the destructor — i.e. at the end of the
// full `SKIP_TEST() << ...` statement; the runner reports the test as skipped.
// NOTE(review): file_/line_ are stored but never used — the skip message does
// not currently include the call site. Confirm whether that is intentional.
class SkipHelper {
 public:
  explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {}
  // Append a value to the skip reason.
  template <typename T>
  SkipHelper& operator<<(const T& value) {
    message_ << value;
    return *this;
  }
  // noexcept(false) is required: destructors are implicitly noexcept in C++11+.
  ~SkipHelper() noexcept(false) {
    std::string msg = message_.str();
    if (!msg.empty()) {
      throw SkipException("Test skipped: " + msg);
    } else {
      throw SkipException("Test skipped");
    }
  }
 private:
  const char* file_;
  int line_;
  std::ostringstream message_;
};
// SFINAE helper: resolves to T if T is a complete type (user-defined fixture),
// otherwise falls back to TestCase. This lets TEST() work with or without a fixture class.
namespace detail {
// C++11-compatible equivalent of std::void_t.
template <typename...>
using void_t = void;
// Primary template: chosen when sizeof(T) is ill-formed, i.e. T is incomplete
// (the forward declaration emitted by TEST() is its only declaration).
template <typename T, typename = void_t<>>
struct FixtureOf {
  using type = TestCase;
};
// Specialization: sizeof(T) is well-formed only for complete types, so this
// is selected when the suite name refers to a previously defined fixture.
template <typename T>
struct FixtureOf<T, void_t<decltype(sizeof(T))>> {
  using type = T;
};
}  // namespace detail
}  // namespace test
}  // namespace mscclpp
// --- Test registration macros ---
// TEST(Suite, Name): if Suite is a previously-defined class, the test inherits from it (fixture).
// Otherwise, the test inherits from TestCase (no fixture needed).
//
// Mechanics: `class test_fixture;` (re)declares the suite name, so FixtureOf
// sees either a complete fixture class (defined earlier) or an incomplete
// type (falls back to TestCase). A namespace-scope bool initializer registers
// the test with the singleton registry before main() runs.
#define TEST(test_fixture, test_name)                                                                     \
  class test_fixture;                                                                                     \
  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
   public:                                                                                                \
    void TestBody() override;                                                                             \
  };                                                                                                      \
  static bool test_fixture##_##test_name##_registered = []() {                                            \
    ::mscclpp::test::TestRegistry::instance().registerTest(                                               \
        #test_fixture, #test_name,                                                                        \
        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); });          \
    return true;                                                                                          \
  }();                                                                                                    \
  void test_fixture##_##test_name##_Test::TestBody()
// Same as TEST(), but registers the test with isPerfTest = true so it can be
// excluded with --exclude-perf-tests.
#define PERF_TEST(test_fixture, test_name)                                                                \
  class test_fixture;                                                                                     \
  class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf<test_fixture>::type { \
   public:                                                                                                \
    void TestBody() override;                                                                             \
  };                                                                                                      \
  static bool test_fixture##_##test_name##_registered = []() {                                            \
    ::mscclpp::test::TestRegistry::instance().registerTest(                                               \
        #test_fixture, #test_name,                                                                        \
        []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, true);    \
    return true;                                                                                          \
  }();                                                                                                    \
  void test_fixture##_##test_name##_Test::TestBody()
// --- Test runner macro ---
// Expects `argc` and `argv` to be in scope at the call site (typically main()).
#define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv)
// Assertion macros
//
// EXPECT_* report a failure (via utils::reportFailure) and let the test
// continue; ASSERT_* additionally throw std::runtime_error to abort the test
// body (the runner catches it and, if SetUp() succeeded, still runs
// TearDown()).
//
// All twelve binary comparisons share one implementation macro instead of
// twelve near-identical copies. `on_failure_` is the statement executed after
// the failure is reported: `(void)0` for EXPECT_*, a throw for ASSERT_*.
// Both operands are evaluated exactly once.
// NOTE: because the user expression passes through a forwarding macro, a
// macro-valued argument is expanded before stringification in the failure
// message; the checked behavior itself is unchanged.
#define MSCCLPP_TEST_CMP_IMPL_(val1, val2, op_, on_failure_)                                \
  do {                                                                                      \
    auto v1 = (val1);                                                                       \
    auto v2 = (val2);                                                                       \
    if (!(v1 op_ v2)) {                                                                     \
      std::ostringstream oss;                                                               \
      oss << "Expected: " #val1 " " #op_ " " #val2 << "\n  Actual: " << v1 << " vs " << v2; \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());                 \
      on_failure_;                                                                          \
    }                                                                                       \
  } while (0)
// Boolean checks: fail when `condition` is false (TRUE_) or true (FALSE_).
#define MSCCLPP_TEST_TRUE_IMPL_(condition, on_failure_)                                                 \
  do {                                                                                                  \
    if (!(condition)) {                                                                                 \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \
      on_failure_;                                                                                      \
    }                                                                                                   \
  } while (0)
#define MSCCLPP_TEST_FALSE_IMPL_(condition, on_failure_)                                                 \
  do {                                                                                                   \
    if (condition) {                                                                                     \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \
      on_failure_;                                                                                       \
    }                                                                                                    \
  } while (0)
// Abort statement shared by all fatal (ASSERT_*) variants.
#define MSCCLPP_TEST_ABORT_ throw std::runtime_error("Test assertion failed")
// Non-fatal assertions: record the failure and keep running the test.
#define EXPECT_TRUE(condition) MSCCLPP_TEST_TRUE_IMPL_(condition, (void)0)
#define EXPECT_FALSE(condition) MSCCLPP_TEST_FALSE_IMPL_(condition, (void)0)
#define EXPECT_EQ(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, ==, (void)0)
#define EXPECT_NE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, !=, (void)0)
#define EXPECT_LT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <, (void)0)
#define EXPECT_LE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <=, (void)0)
#define EXPECT_GT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >, (void)0)
#define EXPECT_GE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >=, (void)0)
// Fatal assertions: record the failure, then abort the test body.
#define ASSERT_TRUE(condition) MSCCLPP_TEST_TRUE_IMPL_(condition, MSCCLPP_TEST_ABORT_)
#define ASSERT_FALSE(condition) MSCCLPP_TEST_FALSE_IMPL_(condition, MSCCLPP_TEST_ABORT_)
#define ASSERT_EQ(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, ==, MSCCLPP_TEST_ABORT_)
#define ASSERT_NE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, !=, MSCCLPP_TEST_ABORT_)
#define ASSERT_LT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <, MSCCLPP_TEST_ABORT_)
#define ASSERT_LE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, <=, MSCCLPP_TEST_ABORT_)
#define ASSERT_GT(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >, MSCCLPP_TEST_ABORT_)
#define ASSERT_GE(val1, val2) MSCCLPP_TEST_CMP_IMPL_(val1, val2, >=, MSCCLPP_TEST_ABORT_)
// ASSERT_NO_THROW(statement): run `statement`; if it throws anything, report
// the failure (including e.what() for std::exception) and abort the test by
// rethrowing a plain std::runtime_error.
#define ASSERT_NO_THROW(statement)                                                                           \
  do {                                                                                                       \
    try {                                                                                                    \
      statement;                                                                                             \
    } catch (const std::exception& e) {                                                                      \
      std::ostringstream oss;                                                                                \
      oss << "Expected: " #statement " not to throw\n  Actual: threw " << e.what();                          \
      ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str());                                  \
      throw std::runtime_error("Test assertion failed");                                                     \
    } catch (...) {                                                                                          \
      ::mscclpp::test::utils::reportFailure(                                                                 \
          __FILE__, __LINE__, "Expected: " #statement " not to throw\n  Actual: threw unknown exception");   \
      throw std::runtime_error("Test assertion failed");                                                     \
    }                                                                                                        \
  } while (0)
// --- Test control macros ---
// Fail the current test immediately. Usage: FAIL() << "reason";
// (The temporary FailHelper reports the failure and throws from its destructor.)
#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__)
// Skip the current test. Usage: SKIP_TEST() << "reason";
// (The temporary SkipHelper throws SkipException from its destructor.)
#define SKIP_TEST() ::mscclpp::test::SkipHelper(__FILE__, __LINE__)
#endif  // MSCCLPP_TEST_FRAMEWORK_HPP_

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -48,7 +48,7 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr<mscclpp::Bootstrap> bootstr
bootstrapTestSendRecv(bootstrap);
}
TEST_F(BootstrapTest, WithId) {
TEST(BootstrapTest, WithId) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
mscclpp::UniqueId id;
if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId();
@@ -57,13 +57,13 @@ TEST_F(BootstrapTest, WithId) {
bootstrapTestAll(bootstrap);
}
TEST_F(BootstrapTest, WithIpPortPair) {
TEST(BootstrapTest, WithIpPortPair) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->initialize(gEnv->args["ip_port"]);
bootstrapTestAll(bootstrap);
}
TEST_F(BootstrapTest, ResumeWithId) {
TEST(BootstrapTest, ResumeWithId) {
// This test may take a few minutes.
bootstrapTestTimer.set(300);
@@ -76,19 +76,19 @@ TEST_F(BootstrapTest, ResumeWithId) {
}
}
TEST_F(BootstrapTest, ResumeWithIpPortPair) {
TEST(BootstrapTest, ResumeWithIpPortPair) {
for (int i = 0; i < 5; ++i) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->initialize(gEnv->args["ip_port"]);
}
}
TEST_F(BootstrapTest, ExitBeforeConnect) {
TEST(BootstrapTest, ExitBeforeConnect) {
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, gEnv->worldSize);
bootstrap->createUniqueId();
}
TEST_F(BootstrapTest, TimeoutWithId) {
TEST(BootstrapTest, TimeoutWithId) {
mscclpp::Timer timer;
// All ranks initialize a bootstrap with their own id (will hang)
@@ -99,7 +99,7 @@ TEST_F(BootstrapTest, TimeoutWithId) {
// Set bootstrap timeout to 1 second
bootstrap->initialize(id, 1);
} catch (const mscclpp::Error& e) {
ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout);
ASSERT_TRUE(e.getErrorCode() == mscclpp::ErrorCode::Timeout);
}
// Timeout should be slightly greater than 1 second
@@ -139,7 +139,7 @@ class MPIBootstrap : public mscclpp::Bootstrap {
}
};
TEST_F(BootstrapTest, MPIBootstrap) {
TEST(BootstrapTest, MPIBootstrap) {
auto bootstrap = std::make_shared<MPIBootstrap>();
bootstrapTestAll(bootstrap);
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -185,7 +185,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) {
return true;
}
TEST_F(CommunicatorTest, BasicWrite) {
TEST(CommunicatorTest, BasicWrite) {
if (gEnv->rank >= numRanksToUse) return;
deviceBufferInit();
@@ -215,7 +215,7 @@ __global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle
}
}
TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
TEST(CommunicatorTest, WriteWithDeviceSemaphores) {
if (gEnv->rank >= numRanksToUse) return;
std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>> semaphores;
@@ -254,7 +254,7 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) {
communicator->bootstrap()->barrier();
}
TEST_F(CommunicatorTest, WriteWithHostSemaphores) {
TEST(CommunicatorTest, WriteWithHostSemaphores) {
if (gEnv->rank >= numRanksToUse) return;
std::unordered_map<int, std::shared_ptr<mscclpp::Host2HostSemaphore>> semaphores;

View File

@@ -1,7 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
#include <unistd.h>
#include <filesystem>
#include <mscclpp/env.hpp>
@@ -22,7 +23,7 @@ std::string getExecutablePath() {
void ExecutorTest::SetUp() {
if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) {
GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2";
SKIP_TEST() << "This test requires world size to be 2 and ranks per node to be 2";
}
MultiProcessTest::SetUp();
@@ -49,7 +50,7 @@ void ExecutorTest::TearDown() {
MultiProcessTest::TearDown();
}
TEST_F(ExecutorTest, TwoNodesAllreduce) {
TEST(ExecutorTest, TwoNodesAllreduce) {
std::string executablePath = getExecutablePath();
std::filesystem::path path = executablePath;
std::filesystem::path executionFilesPath =

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <mpi.h>
@@ -18,9 +18,7 @@ void IbTestBase::SetUp() {
}
void IbPeerToPeerTest::SetUp() {
#if !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
REQUIRE_IBVERBS;
IbTestBase::SetUp();
@@ -80,7 +78,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6
qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
}
TEST_F(IbPeerToPeerTest, SimpleSendRecv) {
TEST(IbPeerToPeerTest, SimpleSendRecv) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;
@@ -195,7 +193,7 @@ __global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIt
}
}
TEST_F(IbPeerToPeerTest, MemoryConsistency) {
TEST(IbPeerToPeerTest, MemoryConsistency) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;
@@ -303,7 +301,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) {
EXPECT_EQ(res, 0);
}
TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
TEST(IbPeerToPeerTest, SimpleAtomicAdd) {
if (gEnv->rank >= 2) {
// This test needs only two ranks
return;

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
@@ -8,7 +8,7 @@
void MemoryChannelOneToOneTest::SetUp() {
// Need at least two ranks within a node
if (gEnv->nRanksPerNode < 2) {
GTEST_SKIP();
SKIP_TEST();
}
// Use only two ranks
setNumRanksToUse(2);
@@ -88,27 +88,12 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName,
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
// The least nelem is 2 for packet ping pong
kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelWrapper(buff.get(), gEnv->rank, nElem, ret.get(), defaultNTries);
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
int nTries = 1000000;
communicator->bootstrap()->barrier();
@@ -169,7 +154,7 @@ __global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) {
}
}
TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
TEST(MemoryChannelOneToOneTest, PutPingPong) {
if (gEnv->rank >= numRanksToUse) return;
const int nElem = 4 * 1024 * 1024;
@@ -187,28 +172,12 @@ TEST_F(MemoryChannelOneToOneTest, PutPingPong) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
}
__global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
@@ -248,7 +217,7 @@ __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) {
}
}
TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
TEST(MemoryChannelOneToOneTest, GetPingPong) {
if (gEnv->rank >= numRanksToUse) return;
const int nElem = 4 * 1024 * 1024;
@@ -266,28 +235,12 @@ TEST_F(MemoryChannelOneToOneTest, GetPingPong) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
}
__global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) {
@@ -371,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int*
}
}
TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) {
TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) {
auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
};
packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper);
}
TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) {
TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) {
auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) {
kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries);
};

View File

@@ -98,14 +98,18 @@ static std::unordered_map<std::string, std::string> parseArgs(int argc, const ch
continue;
}
// Unrecognized positional token: ignore to keep parser permissive for gtest/MPI extras
// Unrecognized positional token: ignore
}
return options;
}
void MultiProcessTestEnv::SetUp() {
MPI_Init(NULL, NULL);
int initialized = 0;
MPI_Initialized(&initialized);
if (!initialized) {
MPI_Init(NULL, NULL);
}
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
// get the local number of nodes with MPI
@@ -128,18 +132,17 @@ void MultiProcessTest::TearDown() {
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
gEnv = new MultiProcessTestEnv(argc, (const char**)argv);
::testing::AddGlobalTestEnvironment(gEnv);
::mscclpp::test::TestRegistry::instance().addEnvironment(gEnv);
return RUN_ALL_TESTS();
}
TEST_F(MultiProcessTest, Prelim) {
TEST(MultiProcessTest, Prelim) {
// Test to make sure the MPI environment is set up correctly
ASSERT_GE(gEnv->worldSize, 2);
}
TEST_F(MultiProcessTest, HostName) {
TEST(MultiProcessTest, HostName) {
const size_t maxNameLen = 1024;
std::vector<char> buffer(gEnv->worldSize * maxNameLen, '\0');
std::string hostName = mscclpp::getHostName(maxNameLen, '\0');
@@ -159,7 +162,7 @@ TEST_F(MultiProcessTest, HostName) {
}
}
TEST_F(MultiProcessTest, HostHash) {
TEST(MultiProcessTest, HostHash) {
std::vector<uint64_t> buffer(gEnv->worldSize, 0);
uint64_t hostHash = mscclpp::getHostHash();
buffer[gEnv->rank] = hostHash;

View File

@@ -4,8 +4,6 @@
#ifndef MSCCLPP_MP_UNIT_TESTS_HPP_
#define MSCCLPP_MP_UNIT_TESTS_HPP_
#include <gtest/gtest.h>
#include <mscclpp/core.hpp>
#include <mscclpp/executor.hpp>
#include <mscclpp/memory_channel.hpp>
@@ -13,10 +11,18 @@
#include <mscclpp/port_channel.hpp>
#include <mscclpp/utils.hpp>
#include "../framework.hpp"
#include "ib.hpp"
#include "utils_internal.hpp"
class MultiProcessTestEnv : public ::testing::Environment {
// Skip the current test if IBVerbs is not available in this build
#if defined(USE_IBVERBS)
#define REQUIRE_IBVERBS
#else
#define REQUIRE_IBVERBS SKIP_TEST() << "This test requires IBVerbs that the current build does not support."
#endif
class MultiProcessTestEnv : public ::mscclpp::test::Environment {
public:
MultiProcessTestEnv(int argc, const char** argv);
@@ -37,7 +43,7 @@ mscclpp::Transport ibIdToTransport(int id);
int rankToLocalRank(int rank);
int rankToNode(int rank);
class MultiProcessTest : public ::testing::Test {
class MultiProcessTest : public ::mscclpp::test::TestCase {
protected:
void TearDown() override;
};

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <cstdint>
#include <mscclpp/concurrency_device.hpp>
@@ -178,26 +178,12 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
const int nTries = 1000;
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, params.waitWithPoll, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
proxyService->stopProxy();
}
@@ -223,8 +209,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
const std::string testName = ::mscclpp::test::currentTestName();
const int nTries = 1000;
// Warm-up
@@ -247,63 +232,51 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
proxyService->stopProxy();
}
TEST_F(PortChannelOneToOneTest, PingPong) {
TEST(PortChannelOneToOneTest, PingPong) {
testPingPong(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostMode) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongEthernet) {
TEST(PortChannelOneToOneTest, PingPongEthernet) {
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongWithPoll) {
TEST(PortChannelOneToOneTest, PingPongWithPoll) {
testPingPong(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerf) {
TEST(PortChannelOneToOneTest, PingPongPerf) {
testPingPongPerf(PingPongTestParams{
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
}
TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) {
REQUIRE_IBVERBS;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) {
TEST(PortChannelOneToOneTest, PingPongPerfEthernet) {
testPingPongPerf(PingPongTestParams{
.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default});
}
@@ -407,34 +380,14 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) {
std::shared_ptr<int> ret = mscclpp::detail::gpuCallocHostShared<int>();
const int nTries = 1000;
// The least nelem is 2 for packet ping pong
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 2, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
1024 * 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
*ret = 0;
kernelProxyLLPingPong<true><<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank,
4 * 1024 * 1024, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) {
*ret = 0;
kernelProxyLLPingPong<true>
<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, nElem, nTries, ret.get());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
EXPECT_EQ(*ret, 0);
}
communicator->bootstrap()->barrier();
@@ -471,8 +424,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
proxyService->startProxy();
auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info();
const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name());
const std::string testName = ::mscclpp::test::currentTestName();
const int nTries = 1000000;
// Warm-up
@@ -497,47 +449,32 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
proxyService->stopProxy();
}
TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
REQUIRE_IBVERBS;
testPacketPingPong(true, IbMode::Host);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
REQUIRE_IBVERBS;
testPacketPingPongPerf(true, IbMode::Host);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPacketPingPongPerf(true, IbMode::HostNoAtomic);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPingPong(PingPongTestParams{
.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
#if defined(USE_IBVERBS)
TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) {
REQUIRE_IBVERBS;
testPacketPingPong(true, IbMode::HostNoAtomic);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

View File

@@ -1,5 +1,5 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <algorithm>
#include <mscclpp/switch_channel.hpp>
@@ -10,10 +10,10 @@
void SwitchChannelTest::SetUp() {
// Need at least two ranks within a node
if (gEnv->nRanksPerNode < 2) {
GTEST_SKIP();
SKIP_TEST();
}
if (!mscclpp::isNvlsSupported()) {
GTEST_SKIP();
SKIP_TEST();
}
// Use only two ranks
setNumRanksToUse(2);
@@ -23,6 +23,8 @@ void SwitchChannelTest::SetUp() {
void SwitchChannelTest::TearDown() { CommunicatorTestBase::TearDown(); }
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
__global__ void kernelSwitchReduce() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
@@ -31,7 +33,16 @@ __global__ void kernelSwitchReduce() {
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST_F(SwitchChannelTest, SimpleAllReduce) {
__global__ void kernelSwitchReduceTwo() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
gConstSwitchChan1.broadcast(0, val1);
auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
gConstSwitchChan2.broadcast(0, val2);
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST(SwitchChannelTest, SimpleAllReduce) {
if (gEnv->rank >= numRanksToUse) return;
std::vector<int> ranks;
@@ -66,22 +77,13 @@ TEST_F(SwitchChannelTest, SimpleAllReduce) {
for (int i = 0; i < numRanksToUse; i++) {
expected += i + 1.0f;
}
ASSERT_EQ(result, expected) << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank;
if (result != expected) {
std::cerr << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank << std::endl;
}
ASSERT_EQ(result, expected);
}
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1;
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2;
__global__ void kernelSwitchReduceTwo() {
#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
auto val1 = gConstSwitchChan1.reduce<mscclpp::f32x1>(0);
gConstSwitchChan1.broadcast(0, val1);
auto val2 = gConstSwitchChan2.reduce<mscclpp::f32x1>(0);
gConstSwitchChan2.broadcast(0, val2);
#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900)
}
TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
TEST(SwitchChannelTest, TwoChannelsSameConnection) {
if (gEnv->rank >= numRanksToUse) return;
std::vector<int> ranks;
@@ -97,12 +99,9 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice));
MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice));
// Connection size must be large enough for two granularity-aligned buffers.
// The multicast granularity is typically 2MB, so we need at least 2 * 2MB.
const size_t connSize = buffer1.bytes() + buffer2.bytes();
auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize);
// Bind two separate buffers to the same connection
auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize);
auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize);
@@ -132,6 +131,6 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) {
expected1 += (i + 1.0f) * 1.0f;
expected2 += (i + 1.0f) * 10.0f;
}
ASSERT_EQ(result1, expected1) << "Channel1: expected " << expected1 << " but got " << result1;
ASSERT_EQ(result2, expected2) << "Channel2: expected " << expected2 << " but got " << result2;
ASSERT_EQ(result1, expected1);
ASSERT_EQ(result2, expected2);
}

View File

@@ -1,44 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Find required packages
find_package(MPI REQUIRED)
# Note: nlohmann_json::nlohmann_json target is already available from the main project
# Set up common libraries and includes for tests
set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
if(MSCCLPP_USE_IB)
list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(PERF_TEST_INC_COMMON
PRIVATE ${PROJECT_SOURCE_DIR}/include
SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
# Function to add a test executable
function(add_perf_test_executable name sources)
if(MSCCLPP_USE_ROCM)
set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
endif()
add_executable(${name} ${sources})
target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON})
# Link nlohmann_json - use the target from main project
target_link_libraries(${name} nlohmann_json::nlohmann_json)
if(MSCCLPP_USE_IB)
target_compile_definitions(${name} PRIVATE USE_IBVERBS)
endif()
target_include_directories(${name} ${PERF_TEST_INC_COMMON})
target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS)
# Set C++ standard
target_compile_features(${name} PRIVATE cxx_std_17)
set_target_properties(${name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/perf")
endfunction()
# Add FIFO test
add_perf_test_executable(fifo_test "framework.cc;fifo_test.cu")

View File

@@ -1,298 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <getopt.h>
#include <iostream>
#include <map>
#include <memory>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <sstream>
#include <stdexcept>
#include "framework.hpp"
using namespace mscclpp::test;
// Constants for timeout and trigger calculation
constexpr uint64_t TIMEOUT_SPINS = 1000000;
constexpr int MIN_TRIGGERS = 1000;
constexpr int MIN_WARMUP_TRIGGERS = 100;
constexpr int TRIGGERS_PER_FIFO_SIZE = 10;
constexpr int WARMUP_TRIGGERS_PER_FIFO_SIZE = 2;
__constant__ mscclpp::FifoDeviceHandle gFifoDeviceHandle;
__global__ void kernelFifoPush(size_t numTriggers) {
mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
mscclpp::ProxyTrigger trigger;
for (size_t i = 1; i <= numTriggers; ++i) {
trigger.fst = i;
trigger.snd = tid ^ i;
fifo.push(trigger);
}
}
__global__ void kernelFifoPushSync(size_t numTriggers) {
mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle;
mscclpp::ProxyTrigger trigger;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t i = 1; i <= numTriggers; ++i) {
trigger.fst = i;
trigger.snd = tid ^ i;
fifo.sync(fifo.push(trigger));
}
}
static void setupCuda(int& cudaDevice, int& numaNode) {
utils::CUDA_CHECK(cudaGetDevice(&cudaDevice));
numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
mscclpp::numaBind(numaNode);
}
// Helper function to consume triggers from FIFO
static bool consumeTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
int totalTriggers = numTriggers * parallel;
std::unordered_map<int, int> triggerCounts;
for (int i = 0; i < totalTriggers; ++i) {
mscclpp::ProxyTrigger trigger;
uint64_t spin = 0;
do {
trigger = hostFifo->poll();
if (spin++ > TIMEOUT_SPINS) {
return false;
}
} while (trigger.fst == 0 || trigger.snd == 0);
// Process trigger (see src/proxy.cc)
trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
trigger.snd = trigger.snd ^ trigger.fst;
assert(triggerCounts[trigger.snd] + 1 == trigger.fst);
triggerCounts[trigger.snd]++;
hostFifo->pop();
}
return true;
}
// Helper function to run a single kernel variant and return performance metrics
std::tuple<double, double, int, int> runSingleKernelVariant(void (*kernel)(size_t),
std::unique_ptr<mscclpp::Fifo>& hostFifo,
cudaStream_t stream, int numParallel) {
// Calculate triggers based on FIFO size
const int numTriggers = std::max(MIN_TRIGGERS, static_cast<int>(hostFifo->size() * TRIGGERS_PER_FIFO_SIZE));
const int warmupTriggers =
std::max(MIN_WARMUP_TRIGGERS, static_cast<int>(hostFifo->size() * WARMUP_TRIGGERS_PER_FIFO_SIZE));
// Warmup
kernel<<<numParallel, 1, 0, stream>>>(warmupTriggers);
utils::CUDA_CHECK(cudaGetLastError());
// Process warmup triggers (note: total triggers = warmupTriggers * numParallel)
if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) {
return {0.0, 0.0, 0, 0}; // Return error values
}
utils::CUDA_CHECK(cudaStreamSynchronize(stream));
// Benchmark
utils::Timer timer;
timer.start();
kernel<<<numParallel, 1, 0, stream>>>(numTriggers);
utils::CUDA_CHECK(cudaGetLastError());
// Process all triggers
if (!consumeTriggers(hostFifo, numTriggers, numParallel)) {
return {0.0, 0.0, 0, 0};
}
utils::CUDA_CHECK(cudaStreamSynchronize(stream));
timer.stop();
const int totalTriggers = numTriggers * numParallel;
double throughput = totalTriggers / timer.elapsedSeconds();
double duration_us = timer.elapsedMicroseconds();
utils::CUDA_CHECK(cudaDeviceSynchronize());
return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel};
}
void runFifoTestVariant(std::unique_ptr<mscclpp::Fifo>& hostFifo, cudaStream_t stream, int numParallel,
nlohmann::ordered_json& combinedMetrics) {
auto [pushThroughput, pushDuration, numTriggers, warmupTriggers] =
runSingleKernelVariant(kernelFifoPush, hostFifo, stream, numParallel);
auto [syncThroughput, syncDuration, syncNumTriggers, syncWarmupTriggers] =
runSingleKernelVariant(kernelFifoPushSync, hostFifo, stream, numParallel);
auto formatThroughput = [](double thru) {
return double(int(thru * 10)) / 10.0; // Round to 1 decimal place
};
std::string prefix = "p" + std::to_string(numParallel) + "_";
combinedMetrics[prefix + "push_throughput"] = formatThroughput(pushThroughput);
combinedMetrics[prefix + "push_sync_throughput"] = formatThroughput(syncThroughput);
combinedMetrics[prefix + "push_duration_us"] = pushDuration;
combinedMetrics[prefix + "push_sync_duration_us"] = syncDuration;
combinedMetrics[prefix + "num_triggers"] = numTriggers;
combinedMetrics[prefix + "warmup_triggers"] = warmupTriggers;
}
struct FifoTestConfig {
int fifoSize;
std::vector<int> parallelismLevels;
// Constructor with default parallelism levels
FifoTestConfig(int size, const std::vector<int>& parallel = {1, 2, 4, 8, 16})
: fifoSize(size), parallelismLevels(parallel) {}
};
void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[maybe_unused]] int worldSize,
[[maybe_unused]] int localRank) {
if (config.fifoSize <= 0) {
throw std::invalid_argument("FIFO size must be positive");
}
if (config.parallelismLevels.empty()) {
throw std::invalid_argument("At least one parallelism level must be specified");
}
int cudaDevice, numaNode;
setupCuda(cudaDevice, numaNode);
auto hostFifo = std::make_unique<mscclpp::Fifo>(config.fifoSize);
mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
utils::CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
cudaStream_t stream;
utils::CUDA_CHECK(cudaStreamCreate(&stream));
// Create test name with parallelism range
std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel";
// Add parallelism range to test name (e.g., "P1-16" or "P1-4-16-64")
if (!config.parallelismLevels.empty()) {
testName += std::to_string(config.parallelismLevels.front());
if (config.parallelismLevels.size() > 1) {
testName += "-" + std::to_string(config.parallelismLevels.back());
// If parallelism levels have non-standard steps, include more detail
if (config.parallelismLevels.size() > 2 &&
(config.parallelismLevels[1] != 2 * config.parallelismLevels[0] || config.parallelismLevels.size() > 3)) {
testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_ParallelCustom";
}
}
}
// Print test configuration
if (utils::isMainRank()) {
std::stringstream ss;
ss << "Running FIFO test with size=" << config.fifoSize << ", parallelism_levels=[";
for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
if (i > 0) ss << ",";
ss << config.parallelismLevels[i];
}
ss << "]";
std::cout << ss.str() << std::endl;
}
nlohmann::ordered_json combinedMetrics;
for (int numParallel : config.parallelismLevels) {
runFifoTestVariant(hostFifo, stream, numParallel, combinedMetrics);
}
std::map<std::string, std::string> testParams;
testParams["fifo_size"] = std::to_string(static_cast<int>(hostFifo->size()));
// Add parallelism levels to test parameters
std::stringstream parallelismStream;
for (size_t i = 0; i < config.parallelismLevels.size(); ++i) {
if (i > 0) parallelismStream << ",";
parallelismStream << config.parallelismLevels[i];
}
testParams["parallelism_levels"] = parallelismStream.str();
utils::recordResult(testName, "fifo", combinedMetrics, testParams);
utils::CUDA_CHECK(cudaStreamDestroy(stream));
}
void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) {
// clang-format off
std::vector<FifoTestConfig> configs = {
{1, {1}},
{128, {1, 8, 64, 128}},
{512, {1, 8, 64, 256, 512}},
};
// clang-format on
for (const auto& config : configs) {
runFifoTest(config, rank, worldSize, localRank);
}
}
static void printUsage(char* argv0) {
std::stringstream ss;
ss << "Usage: " << argv0 << " [OPTIONS]\n"
<< "\n"
<< "Options:\n"
<< " -o, --output-format FORMAT Output format: human or json (default: human)\n"
<< " -f, --output-file FILE JSON output file path (default: report.jsonl)\n"
<< " -v, --verbose Increase verbosity\n"
<< " -h, --help Show this help message\n";
std::cout << ss.str();
}
int main(int argc, char* argv[]) {
std::string outputFormat = "human";
std::string outputFile = "report.jsonl";
bool verbose = false;
static struct option longOptions[] = {{"output-format", required_argument, 0, 'o'},
{"output-file", required_argument, 0, 'f'},
{"verbose", no_argument, 0, 'v'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
int c;
while ((c = getopt_long(argc, argv, "o:f:vh", longOptions, nullptr)) != -1) {
switch (c) {
case 'o':
outputFormat = optarg;
break;
case 'f':
outputFile = optarg;
break;
case 'v':
verbose = true;
break;
case 'h':
printUsage(argv[0]);
return 0;
default:
printUsage(argv[0]);
return 1;
}
}
std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>> tests = {
{"AllFifoTests", "FIFO performance tests with multiple configurations", runAllFifoTests}};
int result = utils::runMultipleTests(argc, argv, tests);
if (utils::isMainRank()) {
if (outputFormat == "json") {
utils::writeResultsToFile(outputFile);
} else {
utils::printResults(verbose);
}
}
utils::cleanupMPI();
return result;
}

View File

@@ -1,208 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "framework.hpp"
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
namespace mscclpp {
namespace test {
// Global state for results
static std::vector<TestResult> g_results;
static int g_mpi_rank = 0;
static int g_mpi_size = 1;
static bool g_mpi_initialized = false;
namespace utils {
// Internal MPI helper functions (not exposed in header)
void initializeMPI(int argc, char* argv[]) {
if (g_mpi_initialized) return;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
g_mpi_initialized = true;
}
static void finalizeMPI() {
if (!g_mpi_initialized) return;
MPI_Finalize();
g_mpi_initialized = false;
}
static int getMPIRank() { return g_mpi_rank; }
static int getMPISize() { return g_mpi_size; }
static bool isMainProcess() { return g_mpi_rank == 0; }
// Public utility functions for test output
bool isMainRank() { return g_mpi_rank == 0; }
void cleanupMPI() { finalizeMPI(); }
std::string getCurrentTimestamp() {
auto now = std::chrono::system_clock::now();
auto time_t = std::chrono::system_clock::to_time_t(now);
std::stringstream ss;
ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S");
return ss.str();
}
// Append one result entry to the process-local result list, stamping it with
// the cached MPI topology and a UTC timestamp.
void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
                  const std::map<std::string, std::string>& test_params) {
  TestResult entry;
  entry.test_name = test_name;
  entry.test_category = test_category;
  entry.timestamp = getCurrentTimestamp();
  entry.process_rank = g_mpi_rank;
  entry.num_processes = g_mpi_size;
  entry.metrics = metrics;
  entry.test_params = test_params;
  g_results.push_back(entry);
}
void writeResultsToFile(const std::string& filename) {
std::ofstream file(filename);
if (!file) {
throw std::runtime_error("Cannot open output file: " + filename);
}
for (const auto& result : g_results) {
nlohmann::ordered_json j;
j["test_name"] = result.test_name;
j["test_category"] = result.test_category;
j["test_config"] = result.test_params;
j["metrics"] = result.metrics;
j["num_processes"] = result.num_processes;
j["process_rank"] = result.process_rank;
j["timestamp"] = result.timestamp;
file << j.dump() << std::endl;
}
}
void printResults(bool verbose) {
if (!isMainProcess()) return;
std::cout << "\n=== Test Results ===" << std::endl;
for (const auto& result : g_results) {
std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl;
if (verbose && !result.test_params.empty()) {
std::cout << " Parameters:" << std::endl;
for (const auto& param : result.test_params) {
std::cout << " " << param.first << ": " << param.second << std::endl;
}
}
std::cout << " Metrics:" << std::endl;
for (auto it = result.metrics.begin(); it != result.metrics.end(); ++it) {
std::cout << " " << it.key() << ": " << it.value() << std::endl;
}
}
std::cout << std::endl;
}
// Timer implementation: a wall-clock stopwatch over
// std::chrono::high_resolution_clock (declared in framework.hpp).
Timer::Timer() : is_running_(false) {}
// Record the starting instant and mark the timer as running.
void Timer::start() {
  is_running_ = true;
  start_time_ = std::chrono::high_resolution_clock::now();
}
// Freeze the measured interval at the current instant.
void Timer::stop() {
  is_running_ = false;
  end_time_ = std::chrono::high_resolution_clock::now();
}
// Elapsed whole microseconds between start() and stop(); if the timer is
// still running, measure up to "now" instead.
double Timer::elapsedMicroseconds() const {
  const auto endPoint = is_running_ ? std::chrono::high_resolution_clock::now() : end_time_;
  return std::chrono::duration_cast<std::chrono::microseconds>(endPoint - start_time_).count();
}
double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1e3; }
double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1e6; }
// Throw std::runtime_error when a CUDA call failed, annotating the message
// with the source location and the driver's error string.
void cudaCheck(cudaError_t err, const char* file, int line) {
  if (err == cudaSuccess) return;
  std::stringstream ss;
  ss << "CUDA error at " << file << ":" << line << " - " << cudaGetErrorString(err);
  throw std::runtime_error(ss.str());
}
// Run each (name, description, function) entry in `tests` under a single MPI
// session. Returns 0 when every test ran cleanly, 1 if any test threw.
//
// Each test function receives (rank, size, local_rank). A barrier separates
// consecutive tests so all ranks advance together. MPI is initialized here
// but deliberately NOT finalized on the success path — callers own cleanup
// (via cleanupMPI) so they can still write/print accumulated results.
int runMultipleTests(
    int argc, char* argv[],
    const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests) {
  int totalResult = 0;
  // Initialize MPI once for all tests
  initializeMPI(argc, argv);
  try {
    // Get MPI information
    int rank = getMPIRank();
    int size = getMPISize();
    int local_rank = rank;  // For simplicity, assume local_rank = rank
    for (const auto& test : tests) {
      const std::string& testName = std::get<0>(test);
      const std::string& testDescription = std::get<1>(test);
      const std::function<void(int, int, int)>& testFunction = std::get<2>(test);
      // Only rank 0 announces the test to avoid interleaved output.
      if (rank == 0) {
        std::cout << "Running test: " << testName << std::endl;
        if (!testDescription.empty()) {
          std::cout << " " << testDescription << std::endl;
        }
      }
      // Don't clear results - accumulate them for all tests in the same file
      // g_results.clear(); // Commented out to accumulate results
      try {
        // Run the individual test function with MPI information
        testFunction(rank, size, local_rank);
        // Synchronize before moving to next test
        MPI_Barrier(MPI_COMM_WORLD);
      } catch (const std::exception& e) {
        // A failing test marks the whole run as failed but does not stop
        // the remaining tests from executing.
        if (rank == 0) {
          std::cerr << "Error in test " << testName << ": " << e.what() << std::endl;
        }
        totalResult = 1;
      }
    }
    // Don't cleanup MPI here - let the caller handle it
    // finalizeMPI();
  } catch (const std::exception& e) {
    // Failure outside the per-test loop (e.g. setup): finalize MPI ourselves,
    // since the caller's result-reporting path will not run meaningfully.
    if (g_mpi_rank == 0) {
      std::cerr << "Error: " << e.what() << std::endl;
    }
    finalizeMPI();
    return 1;
  }
  return totalResult;
}
} // namespace utils
} // namespace test
} // namespace mscclpp

View File

@@ -1,80 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
#define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_
#include <mpi.h>
#include <chrono>
#include <fstream>
#include <functional>
#include <map>
#include <mscclpp/gpu.hpp>
#include <nlohmann/json.hpp>
#include <string>
#include <tuple>
#include <vector>
namespace mscclpp {
namespace test {
// Test result structure: one recorded measurement set from a single test run,
// filled in by utils::recordResult().
struct TestResult {
  std::string test_name;      // name passed to recordResult()
  std::string test_category;  // grouping label for the test
  std::map<std::string, std::string> test_params;  // free-form configuration key/values
  nlohmann::ordered_json metrics;  // measured values; insertion order preserved
  int num_processes;  // MPI world size when the result was recorded
  int process_rank;   // MPI rank that produced this result
  std::string timestamp;  // UTC time, "YYYY-MM-DDTHH:MM:SS"
};
// Simple utility functions for testing
namespace utils {
// Test execution utilities
int runMultipleTests(
int argc, char* argv[],
const std::vector<std::tuple<std::string, std::string, std::function<void(int, int, int)>>>& tests);
// MPI management
void initializeMPI(int argc, char* argv[]);
void cleanupMPI();
bool isMainRank();
// Result recording
void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics,
const std::map<std::string, std::string>& test_params = {});
// Output utilities
// Write all recorded results to `filename` as JSON Lines; print them to
// stdout (rank 0 only) with optional parameter detail.
// Note: the redundant re-declaration of cleanupMPI() that used to follow was
// removed; it is already declared in the MPI management section above.
void writeResultsToFile(const std::string& filename);
void printResults(bool verbose = false);
// Timing utilities
// Simple wall-clock stopwatch over std::chrono::high_resolution_clock.
class Timer {
 public:
  Timer();
  // Begin (or restart) timing from the current instant.
  void start();
  // Freeze the measured interval at the current instant.
  void stop();
  // Elapsed time between start() and stop(); if still running, up to "now".
  double elapsedMicroseconds() const;
  double elapsedMilliseconds() const;
  double elapsedSeconds() const;

 private:
  std::chrono::high_resolution_clock::time_point start_time_;  // set by start()
  std::chrono::high_resolution_clock::time_point end_time_;    // set by stop()
  bool is_running_;  // true between start() and stop()
};
// CUDA utilities
void cudaCheck(cudaError_t err, const char* file, int line);
#define CUDA_CHECK(call) cudaCheck(call, __FILE__, __LINE__)
} // namespace utils
} // namespace test
} // namespace mscclpp
#endif // MSCCLPP_TEST_PERF_FRAMEWORK_HPP_

View File

@@ -1,11 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
target_sources(unit_tests PRIVATE
unit_tests_main.cc
core_tests.cc
gpu_utils_tests.cc
errors_tests.cc
fifo_tests.cu
fifo_perf_tests.cu
numa_tests.cc
socket_tests.cc
utils_tests.cc

View File

@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT License.
#include <gtest/gtest.h>
#include "../framework.hpp"
#undef NDEBUG
#ifndef DEBUG_BUILD

View File

@@ -1,12 +1,14 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gmock/gmock.h>
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/core.hpp>
class LocalCommunicatorTest : public ::testing::Test {
#include "../framework.hpp"
// TODO: TransportFlags needs operator<< for EXPECT_EQ to work
// Using ASSERT_TRUE with manual comparisons as workaround
class LocalCommunicatorTest : public ::mscclpp::test::TestCase {
protected:
void SetUp() override {
bootstrap = std::make_shared<mscclpp::TcpBootstrap>(0, 1);
@@ -18,15 +20,15 @@ class LocalCommunicatorTest : public ::testing::Test {
std::shared_ptr<mscclpp::Communicator> comm;
};
TEST_F(LocalCommunicatorTest, RegisterMemory) {
TEST(LocalCommunicatorTest, RegisterMemory) {
int dummy[42];
auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
EXPECT_EQ(memory.data(), &dummy);
EXPECT_EQ(memory.size(), sizeof(dummy));
EXPECT_EQ(memory.transports(), mscclpp::NoTransports);
ASSERT_TRUE(memory.transports() == mscclpp::NoTransports);
}
TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
TEST(LocalCommunicatorTest, SendMemoryToSelf) {
int dummy[42];
auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports);
comm->sendMemory(memory, 0);
@@ -34,5 +36,5 @@ TEST_F(LocalCommunicatorTest, SendMemoryToSelf) {
auto sameMemory = memoryFuture.get();
EXPECT_EQ(sameMemory.data(), memory.data());
EXPECT_EQ(sameMemory.size(), memory.size());
EXPECT_EQ(sameMemory.transports(), memory.transports());
ASSERT_TRUE(sameMemory.transports() == memory.transports());
}

View File

@@ -1,30 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/errors.hpp>
#include "../framework.hpp"
// TODO: ErrorCode needs operator<< for EXPECT_EQ to work
// Using ASSERT_TRUE with manual comparisons as workaround
TEST(ErrorsTest, SystemError) {
mscclpp::Error error("test", mscclpp::ErrorCode::SystemError);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::SystemError);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: SystemError)"));
}
TEST(ErrorsTest, InternalError) {
mscclpp::Error error("test", mscclpp::ErrorCode::InternalError);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InternalError);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InternalError)"));
}
TEST(ErrorsTest, InvalidUsage) {
mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InvalidUsage);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InvalidUsage)"));
}
TEST(ErrorsTest, Timeout) {
mscclpp::Error error("test", mscclpp::ErrorCode::Timeout);
EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout);
ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::Timeout);
EXPECT_EQ(error.what(), std::string("test (mscclpp failure: Timeout)"));
}

View File

@@ -0,0 +1,85 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <cassert>
#include <memory>
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <unordered_map>
#include "../framework.hpp"
// Simple FIFO performance test to be run as part of unit_tests
// This is a performance test that can be excluded from coverage runs
// using the --exclude-perf-tests flag.
constexpr uint64_t TIMEOUT_SPINS = 1000000;
constexpr int MIN_TRIGGERS = 100; // Reduced for faster unit test execution
__constant__ mscclpp::FifoDeviceHandle gFifoPerfDeviceHandle;
// Device kernel: each thread pushes `numTriggers` triggers into the FIFO.
// Trigger i (1-based) carries fst = i and snd = tid ^ i, so the host consumer
// can recover the producing thread id and check per-thread ordering.
__global__ void kernelFifoPerfPush(size_t numTriggers) {
  mscclpp::FifoDeviceHandle& fifo = gFifoPerfDeviceHandle;
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  mscclpp::ProxyTrigger trigger;
  for (size_t i = 1; i <= numTriggers; ++i) {
    trigger.fst = i;
    trigger.snd = tid ^ i;
    fifo.push(trigger);
  }
}
// Drain numTriggers * parallel triggers from the host side of the FIFO and
// validate them. Returns false on a poll timeout or an out-of-order trigger.
//
// Counters are keyed by the recovered producer thread id. Keys and counts are
// 64-bit: ProxyTrigger fields are 64-bit, and the previous int-keyed map
// silently narrowed them.
static bool consumePerfTriggers(std::unique_ptr<mscclpp::Fifo>& hostFifo, int numTriggers, int parallel) {
  int totalTriggers = numTriggers * parallel;
  std::unordered_map<uint64_t, uint64_t> triggerCounts;
  for (int i = 0; i < totalTriggers; ++i) {
    mscclpp::ProxyTrigger trigger;
    uint64_t spin = 0;
    // Busy-poll until a trigger becomes visible, bailing out after
    // TIMEOUT_SPINS iterations so a stalled device cannot hang the test.
    do {
      trigger = hostFifo->poll();
      if (spin++ > TIMEOUT_SPINS) {
        return false;  // timed out waiting for the device to produce
      }
    } while (trigger.fst == 0 || trigger.snd == 0);
    // Clear bit 63 of snd (presumably set by the device-side push, since the
    // kernel itself writes snd = tid ^ i — TODO confirm against Fifo), then
    // undo the tid ^ i encoding to recover the producing thread id.
    trigger.snd ^= ((uint64_t)1 << (uint64_t)63);
    trigger.snd = trigger.snd ^ trigger.fst;
    // fst values from a given thread must arrive strictly in order 1, 2, 3, ...
    if (triggerCounts[trigger.snd] + 1 != trigger.fst) {
      return false;  // Validation failed
    }
    triggerCounts[trigger.snd]++;
    hostFifo->pop();
  }
  return true;
}
// Performance smoke test: one device thread pushes MIN_TRIGGERS triggers
// through a 128-slot FIFO while the host drains and validates them.
// Registered via PERF_TEST so it is skipped under --exclude-perf-tests.
PERF_TEST(FifoPerfTest, BasicPerformance) {
  int cudaDevice, numaNode;
  CUDA_CHECK(cudaGetDevice(&cudaDevice));
  // Bind the host thread to the NUMA node associated with the active GPU.
  numaNode = mscclpp::getDeviceNumaNode(cudaDevice);
  mscclpp::numaBind(numaNode);
  const int fifoSize = 128;
  const int numTriggers = MIN_TRIGGERS;
  const int numParallel = 1;  // single producer thread
  auto hostFifo = std::make_unique<mscclpp::Fifo>(fifoSize);
  // Publish the device-side handle through the __constant__ symbol the
  // kernel reads.
  mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle();
  CUDA_CHECK(cudaMemcpyToSymbol(gFifoPerfDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle)));
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  // Run kernel asynchronously; the host consumes triggers concurrently.
  kernelFifoPerfPush<<<numParallel, 1, 0, stream>>>(numTriggers);
  CUDA_CHECK(cudaGetLastError());
  // Process triggers
  bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel);
  ASSERT_TRUE(success);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaDeviceSynchronize());
}

View File

@@ -1,13 +1,12 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/fifo.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include "../framework.hpp"
#include "utils_internal.hpp"
#define ITER 10000 // should be larger than the FIFO size for proper testing

View File

@@ -1,10 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/gpu_utils.hpp>
#include "../framework.hpp"
TEST(GpuUtilsTest, StreamPool) {
auto streamPool = mscclpp::gpuStreamPool();
cudaStream_t s;

View File

@@ -1,13 +1,13 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/port_channel.hpp>
#include <mscclpp/port_channel_device.hpp>
#include "../framework.hpp"
#define MAGIC_CONST 777
__constant__ mscclpp::PortChannelDeviceHandle gPortChannel;

View File

@@ -1,11 +1,11 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/numa.hpp>
#include "../framework.hpp"
TEST(NumaTest, Basic) {
int num;
MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num));

View File

@@ -1,11 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/utils.hpp>
#include <thread>
#include "../framework.hpp"
#include "socket.h"
#include "utils_internal.hpp"

View File

@@ -0,0 +1,6 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "../framework.hpp"
int main(int argc, char** argv) { return RUN_ALL_TESTS(); }

View File

@@ -1,10 +1,9 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include <gtest/gtest.h>
#include <thread>
#include "../framework.hpp"
#include "utils_internal.hpp"
TEST(UtilsInternalTest, getHostHash) {

View File

@@ -1,12 +1,12 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <gtest/gtest.h>
// Licensed under the MIT License.
#include <mscclpp/errors.hpp>
#include <mscclpp/utils.hpp>
#include <thread>
#include "../framework.hpp"
TEST(UtilsTest, getHostName) {
std::string hostname1 = mscclpp::getHostName(1024, '.');
EXPECT_FALSE(hostname1.empty());