Merge latest multinode branch

This commit is contained in:
Qinghua Zhou
2026-04-21 18:19:40 +00:00
226 changed files with 14140 additions and 4497 deletions

View File

@@ -0,0 +1,93 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: CodeCoverageA100
timeoutInMinutes: 40
pool:
name: msccl-ci
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: CodeCoverageH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
- job: CodeCoverageMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
variables:
- group: mscclpp
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942

View File

@@ -1,114 +0,0 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: IntegrationTestRocm
displayName: Integration test ROCm
strategy:
matrix:
rocm6.2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
pool:
name: mscclpp-rocm
container:
image: $[ variables['containerImage'] ]
options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallRcclTest
displayName: Install rccl-test
inputs:
targetType: 'inline'
script: |
git clone https://github.com/ROCm/rccl-tests.git
cd rccl-tests
make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallDep
displayName: Install dependencies
inputs:
targetType: 'inline'
script: |
set -e
git clone https://github.com/Azure/msccl-tools.git
cd msccl-tools
pip3 install .
- task: Bash@3
name: GenerateExectionFiles
displayName: Generate execution files
inputs:
targetType: 'inline'
script: |
set -e
git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users
cd msccl-users
mkdir execution-files
python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp allReduce test
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceWithExecutionFileTest
displayName: Run mscclpp allReduce with execution file
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
-x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \
-x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \
-x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
-b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -41,11 +41,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -37,33 +37,6 @@ jobs:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
@@ -77,107 +50,46 @@ jobs:
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
- template: templates/deploy.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- task: Bash@3
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- task: Bash@3
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- task: Bash@3
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/stop.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT

View File

@@ -40,11 +40,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -61,9 +60,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -0,0 +1,47 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: RcclTestMI300X
displayName: Run MSCCLPP over RCCL Test (MI300X)
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/rccl-test.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
gpuArch: gfx942

View File

@@ -0,0 +1,110 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: TestsCoverageNonPerf
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp
export GCOV_PREFIX_STRIP=$STRIP_COUNT
echo "Running unit_tests..."
./build/bin/unit_tests
echo "unit_tests: PASSED"
echo "Running mp_unit_tests -np 2..."
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 2: PASSED"
echo "Running mp_unit_tests -np 4..."
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 4: PASSED"
- template: run-remote-task.yml
parameters:
name: CaptureCoverage
displayName: Capture coverage data with lcov
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
GCOV_TOOL_ARG=""
if [ "${{ parameters.platform }}" = "rocm" ]; then
apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
GCOV_WRAPPER=$(mktemp)
printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
chmod +x "$GCOV_WRAPPER"
GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
fi
lcov --version
LCOV_CAPTURE_ARGS=""
if lcov --help 2>&1 | grep -q "inconsistent"; then
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
fi
lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
if [ ! -s coverage.info ]; then
echo "ERROR: coverage.info was not generated."
exit 1
fi
lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
lcov --list coverage.info
ls -la coverage.info
- task: Bash@3
name: FetchCoverage
displayName: Fetch coverage data from remote VM
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
HOST=$(head -1 ${HOSTFILE})
ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UploadCodecov
displayName: Upload coverage to Codecov
inputs:
targetType: 'inline'
script: |
set -e
curl -Os https://cli.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,151 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
# Build parameters
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
default: ''
- name: buildType
type: string
default: 'Release'
- name: buildTests
type: string
default: 'true'
- name: cmakeArgs
type: string
default: ''
- name: buildName
type: string
default: 'Build'
- name: buildDisplayName
type: string
default: 'Build'
# Deploy parameters
- name: deployArgs
type: string
default: ''
steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
name: EnsureAzureCLI
displayName: Ensure Azure CLI Installed
inputs:
targetType: inline
script: |
set -e
if command -v az >/dev/null 2>&1; then
az version >/dev/null
exit 0
fi
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# 1. Build
- task: Bash@3
name: ${{ parameters.buildName }}
displayName: ${{ parameters.buildDisplayName }}
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
BUILD_TESTS_ARG=""
if [ "${{ parameters.buildTests }}" = "true" ]; then
BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
fi
GPU_ARCH_ARG=""
if [ -n "${{ parameters.gpuArch }}" ]; then
GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ "${{ parameters.platform }}" = "rocm" ]; then
eval CXX=/opt/rocm/bin/hipcc cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_ROCM=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
else
eval cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
echo "=== Build artifacts ==="
ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
du -sh build/bin/* 2>/dev/null || true
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 2. Write CMake args for pip install on remote VMs
- task: Bash@3
name: WritePipCmakeArgs
displayName: Write pip CMake args
inputs:
targetType: 'inline'
script: |
set -e
PIP_CMAKE_ARGS=""
if [ -n "${{ parameters.gpuArch }}" ]; then
PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ -n "${CMAKE_EXTRA_ARGS}" ]; then
PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}"
fi
echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt
echo "pip CMake args: $(cat pip_cmake_args.txt)"
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 3. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
# 4. Deploy test environment
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: ${{ parameters.deployArgs }}
workingDirectory: '$(System.DefaultWorkingDirectory)'

View File

@@ -1,242 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: SendRecvTest
displayName: Run mscclpp SendRecv test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp AllReduce test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllToAll
displayName: Run mscclpp AllToAll test
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,280 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription       Azure subscription to use for VMSS start/stop
#   vmssName           name of the VMSS to start/stop
#   sshKeySecureFile   the secureFile name for your SSH key
#   nvccGencode        NVCC -gencode flags used when building NCCL from source
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: sshKeySecureFile
    type: string
  - name: nvccGencode
    type: string
    default: "-gencode=arch=compute_80,code=sm_80"
steps:
  # Check out both this repo and the msccl-users repo (algorithm/plan sources).
  - checkout: self
  - checkout: git://One/msccl-users
  # Build MSCCLPP on the agent; the GPU check is bypassed because the build
  # agent itself has no GPU.
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
    inputs:
      secureFile: ${{ parameters.sshKeySecureFile }}
  # pssh provides parallel-ssh/parallel-scp used below to drive the GPU nodes.
  - task: Bash@3
    name: InstallPackages
    displayName: Install Packages
    inputs:
      targetType: 'inline'
      script: |
        sudo apt-get update -y
        sudo apt-get install pssh -y
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
  - task: AzureCLI@2
    name: StartVMSS
    displayName: Start VMSS
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
  - task: Bash@3
    name: DeployTestEnv
    displayName: Deploy Test Env
    inputs:
      targetType: filePath
      filePath: mscclpp/test/deploy/deploy.sh
      arguments: nccltest-single-node
      workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
  # Copy the msccl-users checkout to every node listed in the CI hostfile.
  - task: Bash@3
    name: CopyMscclUsers
    displayName: Copy msccl-users
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        DST_DIR="/tmp/mscclpp/msccl-users"
        parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
      workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Clone and build NVIDIA nccl-tests (with MPI support) inside the test
  # container on every remote node.
  - task: Bash@3
    name: InstallNcclTests
    displayName: Install NCCL Tests
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
          cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
          cd nccl-tests; \
          MPI=1 MPI_HOME=/usr/local/mpi make -j"'
      workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Build NCCL from source so the fallback tests below can point
  # MSCCLPP_NCCL_LIB_PATH at a real libnccl.so.
  - task: Bash@3
    name: InstallNccl
    displayName: Install NCCL
    inputs:
      targetType: inline
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
        ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
          cd; git clone https://github.com/NVIDIA/nccl.git; \
          cd nccl; \
          make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
      workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllGatherFallbaclkToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllReduceFallbaclkToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclBroadcastFallbaclkToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbaclkToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Always deallocate the VMSS, even when earlier steps failed, so GPU
  # capacity is not left running.
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       name of the VMSS to start/stop
#   nvccGencode    NVCC -gencode flags used when building NCCL from source
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: nvccGencode
    type: string
    default: "-gencode=arch=compute_80,code=sm_80"
steps:
  # Build, start the VMSS, and deploy the test environment to the nodes.
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      deployArgs: 'nccltest-single-node'
  # Build NVIDIA nccl-tests (with MPI support) on the remote node.
  - template: run-remote-task.yml
    parameters:
      name: InstallNcclTests
      displayName: Install NCCL Tests
      remoteScript: |
        cd
        git clone https://github.com/NVIDIA/nccl-tests.git
        cd nccl-tests
        MPI=1 MPI_HOME=/usr/local/mpi make -j
  # Build the latest tagged NCCL release from source so the fallback tests
  # below can point MSCCLPP_NCCL_LIB_PATH at a real libnccl.so. Fails fast
  # if the GitHub releases API does not yield a tag.
  - template: run-remote-task.yml
    parameters:
      name: InstallNccl
      displayName: Install NCCL
      remoteScript: |
        LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
        if [ -z "$LATEST_TAG" ]; then
          echo "Failed to fetch latest NCCL tag"
          exit 1
        fi
        cd
        git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
        cd nccl
        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
- template: run-remote-task.yml
parameters:
name: RunNcclAllGatherFallbaclkToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclAllReduceFallbaclkToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclBroadcastFallbaclkToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,63 @@
# .azure-pipelines/templates/rccl-test.yml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       VMSS name to start/stop
#   gpuArch        GPU architecture (e.g. gfx942)
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: gpuArch
    type: string
    default: "gfx942"
steps:
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      platform: rocm
      gpuArch: ${{ parameters.gpuArch }}
      buildTests: false
      deployArgs: 'single-node-test true rocm'
  # rccl-tests lives inside the rocm-systems monorepo; a cone sparse-checkout
  # pulls only that project before building with MPI support.
  - template: run-remote-task.yml
    parameters:
      name: InstallRcclTests
      displayName: Install RCCL Tests
      remoteScript: |
        cd
        git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
        cd rocm-systems
        git sparse-checkout init --cone
        git sparse-checkout set projects/rccl-tests
        git checkout
        cd projects/rccl-tests
        MPI=1 MPI_HOME=/usr/local/mpi make -j
  # Each perf test runs twice: once preloading the MSCCLPP NCCL shim, once
  # without (plain RCCL) as a baseline.
  - template: run-remote-task.yml
    parameters:
      name: RunRcclAllGatherTest
      displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
      remoteScript: |
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  - template: run-remote-task.yml
    parameters:
      name: RunRcclAllReduceTest
      displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
      remoteScript: |
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
        mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,27 @@
# Step template wrapping test/deploy/run-remote.sh: executes `remoteScript`
# on the remote test node(s). The script is fed through a quoted heredoc
# ('REMOTE_CMD'), so nothing is expanded by the local shell; the ${{ }}
# substitutions are resolved by Azure Pipelines before the step runs.
#
# Parameters:
#   name              optional step name (omitted when empty)
#   displayName       step display name
#   runRemoteArgs     extra arguments passed to run-remote.sh
#   remoteScript      script body to execute remotely
#   workingDirectory  directory to invoke run-remote.sh from
parameters:
  - name: name
    type: string
    default: ''
  - name: displayName
    type: string
  - name: runRemoteArgs
    type: string
    default: ''
  - name: remoteScript
    type: string
  - name: workingDirectory
    type: string
    default: '$(System.DefaultWorkingDirectory)'
steps:
  - task: Bash@3
    # Only assign a step name when one was provided (empty names are invalid).
    ${{ if ne(parameters.name, '') }}:
      name: ${{ parameters.name }}
    displayName: ${{ parameters.displayName }}
    inputs:
      targetType: 'inline'
      script: |
        test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
        ${{ parameters.remoteScript }}
        REMOTE_CMD
      workingDirectory: ${{ parameters.workingDirectory }}

View File

@@ -0,0 +1,20 @@
# Step template: deallocate the VMSS used by a test job. Runs with
# condition: always() so capacity is released even when earlier steps failed.
#
# Parameters:
#   subscription   Azure subscription to use
#   vmssName       VMSS to deallocate
#   resourceGroup  resource group holding the VMSS (default: mscclpp)
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: resourceGroup
    type: string
    default: mscclpp
steps:
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

View File

@@ -0,0 +1,42 @@
# Step template: run the executor test suite on one CUDA/ROCm node.
#
# Parameters:
#   subscription  Azure subscription to use for VMSS start/stop
#   vmssName      VMSS name to start/stop
#   platform      'cuda' (default) or 'rocm'
#   gpuArch       GPU architecture passed through to the build
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: platform
    type: string
    default: 'cuda'
  - name: gpuArch
    type: string
steps:
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      platform: ${{ parameters.platform }}
      gpuArch: ${{ parameters.gpuArch }}
      deployArgs: 'single-node-test true ${{ parameters.platform }}'
  # Run each bundled execution plan through the executor on 2 ranks.
  # NOTE(review): the nvls plans presumably require NVLS-capable hardware on
  # the target pool — confirm when adding new pools.
  - template: run-remote-task.yml
    parameters:
      name: ExecutorTest
      displayName: Run executor tests
      remoteScript: |
        python3 -m pip install .
        PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
        TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -1,89 +0,0 @@
# Step template (legacy): build MSCCLPP on the agent, start the VMSS, and
# deploy the test environment before running the remote pytests below.
#
# Parameters:
#   subscription       Azure subscription to use for VMSS start/stop
#   vmssName           VMSS name to start/stop
#   sshKeySecureFile   secureFile name of the SSH key for the nodes
#   gpuArch            GPU architecture passed to CMake
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: sshKeySecureFile
    type: string
  - name: gpuArch
    type: string
steps:
  # GPU check is bypassed: the build agent itself has no GPU.
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
    inputs:
      secureFile: ${{ parameters.sshKeySecureFile }}
  # pssh provides parallel-ssh used below to reach the GPU nodes.
  - task: Bash@3
    name: InstallPackages
    displayName: Install Packages
    inputs:
      targetType: 'inline'
      script: |
        sudo apt-get update -y
        sudo apt-get install pssh -y
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
  - task: AzureCLI@2
    name: StartVMSS
    displayName: Start VMSS
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
  - task: Bash@3
    name: DeployTestEnv
    displayName: Deploy Test Env
    inputs:
      targetType: filePath
      filePath: test/deploy/deploy.sh
      arguments: single-node-test false
      workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Always deallocate the VMSS, even when earlier steps failed, so GPU
  # capacity is not left running.
  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: ${{ parameters.subscription }}
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,95 @@
# Step template: run unit tests and pytests twice — first with a build that
# has InfiniBand disabled, then with an IB-enabled rebuild.
#
# Parameters:
#   subscription  Azure subscription to use for VMSS start/stop
#   vmssName      VMSS name to start/stop
#   gpuArch       GPU architecture passed to CMake
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: gpuArch
    type: string
steps:
  # First pass: build with InfiniBand support explicitly disabled.
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      gpuArch: ${{ parameters.gpuArch }}
      cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
      deployArgs: 'single-node-test false'
  - template: run-remote-task.yml
    parameters:
      name: UnitTests
      displayName: Run mscclpp unit tests
      remoteScript: |
        ./build/bin/unit_tests
  # Multi-process unit tests at three rank counts.
  - template: run-remote-task.yml
    parameters:
      name: MpUnitTests
      displayName: Run mscclpp multi-process unit tests
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
        mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
        mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
  - template: run-remote-task.yml
    parameters:
      name: PyTests
      displayName: Run pytests
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
  # Tear down the test container so the IB-enabled build can be redeployed
  # cleanly; `|| true` keeps this best-effort when no container exists.
  - template: run-remote-task.yml
    parameters:
      name: StopContainer
      displayName: Stop existing container
      runRemoteArgs: '--no-docker --no-log'
      remoteScript: |
        sudo docker stop mscclpp-test || true
        sudo docker rm mscclpp-test || true
  - task: Bash@3
    displayName: Remove generated SSH key files
    inputs:
      targetType: 'inline'
      script: |
        rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  # Second pass: rebuild without -DMSCCLPP_USE_IB=OFF.
  # NOTE(review): no explicit -DMSCCLPP_USE_IB=ON is passed; presumably IB is
  # the CMake default — confirm against the project's CMakeLists.
  - task: Bash@3
    name: BuildWithIb
    displayName: Rebuild with IB
    inputs:
      targetType: 'inline'
      script: |
        set -e
        rm -rf build
        mkdir -p build && cd build
        cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
          -DMSCCLPP_USE_CUDA=ON \
          -DMSCCLPP_BUILD_TESTS=ON \
          -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'
  - task: Bash@3
    name: DeployTestEnvWithIb
    displayName: Deploy Test Env (with IB build)
    inputs:
      targetType: filePath
      filePath: test/deploy/deploy.sh
      arguments: single-node-test false
      workingDirectory: $(System.DefaultWorkingDirectory)
  # Re-run the pytests against the IB build, still skipping IB-specific tests.
  - template: run-remote-task.yml
    parameters:
      name: PyTestsWithIbBuildDisableIb
      displayName: Run pytests (IB build, IB tests disabled)
      remoteScript: |
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
  # Deallocate the VMSS (stop.yml runs with condition: always()).
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

View File

@@ -1,145 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
mkdir -p build && cd build; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
make -j"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: 'inline'
script: |
# set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,57 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,135 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,49 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: UnitTests
displayName: Run mscclpp unit tests
remoteScript: |
./build/bin/unit_tests
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -37,17 +37,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -59,17 +58,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -79,17 +77,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -99,21 +96,20 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
timeoutInMinutes: 40
timeoutInMinutes: 60
displayName: Test No IB Environment
pool:
name: msccl-ci-h100
@@ -121,15 +117,55 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-no-ib-env.yaml
- template: templates/ut-no-ib-env.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
- job: UnitTestExecutor
timeoutInMinutes: 60
displayName: Test DSL Executor
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-executor.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'

24
.codecov.yml Normal file
View File

@@ -0,0 +1,24 @@
codecov:
require_ci_to_pass: yes
coverage:
status:
project:
default:
target: 68%
threshold: 1%
patch:
default:
target: 80%
flag_management:
default_rules:
carryforward: true
ignore:
- "test/"
- "examples/"
- "python/"
- "tools/"
- "docs/"
- "docker/"

View File

@@ -25,7 +25,7 @@ For C/C++/CUDA source code:
```
## Formatting
If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files.
If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first.
## Building and Testing
The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions.
@@ -40,10 +40,10 @@ cd ..
For testing after successful build:
```bash
# To run all tests
# To run tests with two GPUs - two is enough for most tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
```
For building a Python package:
@@ -51,6 +51,12 @@ For building a Python package:
python3 -m pip install -e .
```
For Python tests after building the package:
```bash
# Run tests with 8 GPUs - adjust the number as needed
mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx
```
For building documentation (see dependencies in `docs/requirements.txt`):
```bash
cd docs

View File

@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- name: Checkout repository
@@ -51,7 +51,7 @@ jobs:
df -h
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
@@ -62,11 +62,11 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
make -j
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}/version:${{matrix.version}}"
@@ -96,7 +96,7 @@ jobs:
df -h
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
@@ -107,10 +107,10 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
make -j
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}/version:${{matrix.version}}"

View File

@@ -1,69 +0,0 @@
name: IntegrationTest
on: workflow_dispatch
jobs:
IntegrationTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: Run mscclpp AllGather test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- uses: actions/checkout@v4

View File

@@ -1,52 +0,0 @@
name: UnitTest
on: workflow_dispatch
jobs:
UnitTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UnitTests
run: |
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

7
.gitignore vendored
View File

@@ -1,10 +1,9 @@
.vscode/
.hypothesis/
build/
dist/
build_coverage/
__pycache__
.*.swp
.idea/
*.so
.pytest_cache/
_codeql_detected_source_root
docs/_static/versions.js
_codeql_detected_source_root

View File

@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
@@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
# Options
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
option(MSCCLPP_BUILD_TESTS "Build tests" OFF)
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
@@ -56,6 +56,8 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -98,6 +100,62 @@ else()
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
endif()
endif()
# Code coverage setup
if(MSCCLPP_ENABLE_COVERAGE)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
message(STATUS "Code coverage enabled")
# Add coverage flags to C++ targets only (not CUDA)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
# Find lcov
find_program(LCOV_PATH lcov)
if(NOT LCOV_PATH)
message(WARNING "lcov not found. Install lcov to generate coverage reports.")
endif()
if(LCOV_PATH)
# Add coverage target
add_custom_target(coverage
COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
COMMAND ${LCOV_PATH} --directory . --zerocounters
COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Generating code coverage report"
)
# Add coverage clean target
add_custom_target(coverage-clean
COMMAND ${CMAKE_COMMAND} -E remove coverage.info
COMMAND ${LCOV_PATH} --directory . --zerocounters
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Cleaning coverage data"
)
endif()
else()
message(WARNING "Code coverage is only supported with GCC or Clang compilers")
endif()
endif()
if(MSCCLPP_GPU_ARCHS)
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
@@ -166,12 +224,35 @@ if(MSCCLPP_USE_IB)
if(NOT IBVERBS_FOUND)
message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
endif()
find_package(MLX5)
if(MLX5_FOUND)
message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
else()
message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
endif()
endif()
find_package(NUMA REQUIRED)
find_package(Threads REQUIRED)
option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
if(MSCCLPP_USE_ROCM)
set(MSCCLPP_USE_GDRCOPY OFF)
endif()
if(MSCCLPP_USE_GDRCOPY)
find_package(GDRCopy)
if(NOT GDRCOPY_FOUND)
message(STATUS "GDRCopy not found, disabling GDRCopy support")
set(MSCCLPP_USE_GDRCOPY OFF)
else()
message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
endif()
endif()
include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.12.0
)
FetchContent_MakeAvailable(json)
if("${INSTALL_PREFIX}" STREQUAL "")

View File

@@ -3,13 +3,16 @@
[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
| Testing Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -1 +1 @@
0.8.0
0.9.0

50
cmake/FindGDRCopy.cmake Normal file
View File

@@ -0,0 +1,50 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
#
# The following variables are optionally searched for defaults
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
# The following are set after configuration is done:
# GDRCOPY_FOUND
# GDRCOPY_INCLUDE_DIRS
# GDRCOPY_LIBRARIES
find_path(GDRCOPY_INCLUDE_DIRS
NAMES gdrapi.h
HINTS
${GDRCOPY_INCLUDE_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(GDRCOPY_LIBRARIES
NAMES gdrapi
HINTS
${GDRCOPY_LIB_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
if(GDRCOPY_INCLUDE_DIRS)
include(CheckSymbolExists)
set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
unset(CMAKE_REQUIRED_LIBRARIES)
unset(CMAKE_REQUIRED_INCLUDES)
if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
endif()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)

38
cmake/FindMLX5.cmake Normal file
View File

@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the MLX5 Direct Verbs (mlx5dv) library
#
# The following variables are optionally searched for defaults
# MLX5_ROOT_DIR: Base directory where all MLX5 components are found
# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
# MLX5_LIB_DIR: Directory where MLX5 libraries are found
# The following are set after configuration is done:
# MLX5_FOUND
# MLX5_INCLUDE_DIRS
# MLX5_LIBRARIES
find_path(MLX5_INCLUDE_DIRS
NAMES infiniband/mlx5dv.h
HINTS
${MLX5_INCLUDE_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(MLX5_LIBRARIES
NAMES mlx5
HINTS
${MLX5_LIB_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)

View File

@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install lcov 2.2
RUN LCOV_VERSION="2.2" && \
apt-get update && \
apt-get install -y --no-install-recommends \
cpanminus \
gcc \
make \
perl \
&& \
cpanm --notest \
Capture::Tiny \
DateTime \
JSON::XS \
Memory::Process \
TimeDate \
&& \
cd /tmp && \
curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
tar xzf lcov.tar.gz && \
cd lcov-${LCOV_VERSION} && \
make install && \
cd / && rm -rf /tmp/lcov* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
@@ -24,6 +49,33 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Install GDRCopy userspace library for CUDA targets
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^cuda"; then \
GDRCOPY_VERSION="2.5.2" && \
apt-get update -y && \
apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
cd /tmp && \
curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
tar xzf gdrcopy.tar.gz && \
cd gdrcopy-${GDRCOPY_VERSION}/packages && \
./build-deb-packages.sh -k -t && \
dpkg -i libgdrapi_*.deb && \
cd / && rm -rf /tmp/gdrcopy* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Install ROCm-specific packages if building for ROCm
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Create Python venv
RUN python3 -m venv /root/venv && \
echo 'source /root/venv/bin/activate' >> /root/.bashrc
@@ -32,10 +84,13 @@ ENV PATH="/root/venv/bin:${PATH}"
# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda13.0"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
if echo "$TARGET" | grep -q "^rocm"; then \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
pip install --no-cache-dir coverage xlsxwriter
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -1,19 +0,0 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
ENV RCCL_VERSION=rocm-6.2.0
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}
RUN cd /tmp && \
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
cd rccl && \
./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
cd .. && \
rm -rf /tmp/rccl
WORKDIR /

View File

@@ -4,38 +4,39 @@ set -e
declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["cuda11.8"]="/usr/local/cuda-11.8/compat"
["cuda12.4"]="/usr/local/cuda-12.4/compat"
["cuda12.8"]="/usr/local/cuda-12.8/compat"
["cuda12.9"]="/usr/local/cuda-12.9/compat"
["cuda13.0"]="/usr/local/cuda-13.0/compat"
["rocm6.2"]="/opt/rocm/lib"
)
declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
["cuda13.0"]="24.10-3.2.5.0"
["rocm6.2"]="24.10-1.1.4.0"
)
TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
@@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \
if [[ ${TARGET} == rocm* ]]; then
echo "Building ROCm base image..."
docker build -t ${TAG_BASE} \
-f docker/base-x-rocm.dockerfile \
--build-arg BASE_IMAGE=${TAG_TMP} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg GPU_ARCH="gfx942" .
docker rmi ${TAG_TMP}
else
echo "Building CUDA base image..."
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
fi
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
docker build -t ${TAG_BASE_DEV} \
-f docker/base-dev-x.dockerfile \

View File

@@ -5,7 +5,7 @@
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SPHINXMULTIVERSION ?= sphinx-multiversion
SPHINXMULTIVERSION ?= python3 build_multiversion.py
SOURCEDIR = .
BUILDDIR = _build

View File

@@ -26,27 +26,53 @@
* @returns {string} The base path (e.g., '/mscclpp' or '')
*/
function detectBasePath() {
const path = window.location.pathname;
// Match pattern: /base-path/vX.Y.Z/... or /base-path/main/...
// The base path is everything before the version or main directory
const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/);
if (match && match[1]) {
return match[1];
}
// Check if we're at a root that's actually a project site
// Look for common indicators like the repository name in the path
const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/);
if (projectMatch) {
// Verify this isn't a version path at root
const potentialBase = projectMatch[1];
if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') {
// Check if the remaining path contains version info
const remainingPath = path.substring(potentialBase.length);
if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) {
return potentialBase;
// Most reliable method: detect from this script's own URL
// The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js
const scripts = document.getElementsByTagName('script');
for (let i = 0; i < scripts.length; i++) {
const src = scripts[i].src;
if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) {
try {
const url = new URL(src);
const scriptPath = url.pathname;
// Extract base path: everything before /_static/version-selector.js
// But also strip version directories like /v0.8.0/ or /main/
const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/);
if (match) {
let basePath = match[1] || '';
// Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp)
basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, '');
return basePath;
}
} catch (e) {
// URL parsing failed, continue to fallback
// Log a warning to aid debugging when the primary detection method fails.
if (typeof console !== 'undefined' && typeof console.warn === 'function') {
console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e);
}
}
}
}
// Fallback: try to detect from URL path
const path = window.location.pathname;
const segments = path.split('/').filter(s => s.length > 0);
if (segments.length >= 1) {
const firstSegment = segments[0];
// If first segment is not a version tag (vX.Y.Z), not 'main', and
// does not look like a file name (no '.' in the segment), then it's
// the GitHub Pages project base path (e.g., 'mscclpp').
// This handles both:
// /mscclpp/v0.8.0/index.html -> base is /mscclpp
// /mscclpp/index.html -> base is /mscclpp
// while avoiding treating root files like /index.html as a base path.
if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) {
return '/' + firstSegment;
}
}
// No base path (root site or local development)
return '';
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Wrapper around sphinx-multiversion that patches copy_tree to generate
_version.py in each tag checkout. This is needed because setuptools_scm
generates _version.py at build time, but sphinx-multiversion uses
`git archive` which only contains committed files.
Usage (called by Makefile):
python3 build_multiversion.py <sourcedir> <outputdir> [sphinx-opts...]
"""
import os
import re
import subprocess
import sys
import sphinx_multiversion.git as smv_git
from sphinx_multiversion import main as smv_main
# Save the original copy_tree
_original_copy_tree = smv_git.copy_tree
def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."):
"""Call original copy_tree, then generate _version.py from the VERSION file."""
_original_copy_tree(gitroot, src, dst, reference, sourcepath)
# Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0")
refname = getattr(reference, "refname", "") or ""
match = re.search(r"v(\d+\.\d+\.\d+)", refname)
if not match:
return
version = match.group(1)
version_py_dir = os.path.join(dst, "python", "mscclpp")
if os.path.isdir(version_py_dir):
version_py = os.path.join(version_py_dir, "_version.py")
if not os.path.exists(version_py):
with open(version_py, "w") as f:
f.write(f'__version__ = "{version}"\n')
# Monkey-patch
smv_git.copy_tree = _patched_copy_tree
if __name__ == "__main__":
sys.exit(smv_main(sys.argv[1:]))

View File

@@ -11,6 +11,18 @@
import sys
import importlib.util
from pathlib import Path
from unittest.mock import MagicMock
class NamedMock(MagicMock):
def __getattr__(self, name):
attr = super().__getattr__(name)
if isinstance(attr, MagicMock):
# Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection.
attr.__name__ = name
attr.__qualname__ = name
return attr
# Add the python package to sys.path so Sphinx can find it
project_root = Path(__file__).parent.parent
@@ -63,7 +75,7 @@ autodoc_default_options = {
"show-inheritance": True,
}
# only mock the C-extension when using the source tree
autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_typehints = "description"
napoleon_google_docstring = True
napoleon_numpy_docstring = True
@@ -71,6 +83,10 @@ intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"numpy": ("https://numpy.org/doc/stable/", None),
}
mock_mscclpp = NamedMock()
# Set attributes to satisfy Sphinx autodoc inspection.
mock_mscclpp.env.return_value.cache_dir = "_mscclpp"
sys.modules["mscclpp._mscclpp"] = mock_mscclpp
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

View File

@@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol
python3 -m mscclpp --install
```
This installs bundled default execution plans into `~/.cache/mscclpp/default` by default.
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
## Your First Algorithm: AllGather
Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features.

View File

@@ -56,9 +56,12 @@ python3 -m mscclpp --install
After installation, the generated JSON execution plan can be found at:
```
~/.cache/mscclpp_default/
~/.cache/mscclpp/default/
```
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
**Performance Results:**
The figure below shows the performance characteristics for small message sizes in a two-node configuration:

View File

@@ -129,7 +129,7 @@ class CustomizedComm:
self._algo_large = [
algo for algo in algorithms
if algo.collective == "allreduce"
and algo.name == "default_allreduce_nvls_with_copy"
and algo.name == "default_allreduce_nvls_warp_pipeline"
][0]
def all_reduce(self, tensor: torch.Tensor, stream=None):
@@ -332,7 +332,8 @@ public:
size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, mscclpp::ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
},
// Context initialization function
@@ -343,8 +344,8 @@ public:
},
// Context key generation function
[self](const void* input, void* output,
size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype);
size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) {
return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
}
);
}
@@ -468,3 +469,196 @@ stream_handle = torch.cuda.current_stream().cuda_stream
All examples are in [`examples/torch-integration/`](../../examples/torch-integration/).
---
## Performance Tuning
The default algorithms use a fixed heuristic to select algorithms based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py)
### How It Works
1. **Candidate selection** — For each power-of-two message size from 1 KB to 128 MB, the tuner picks the applicable algorithms:
- All sizes (when NVLS is supported): `default_allreduce_nvls_zero_copy`
- Small messages (≤ 4 MB): `default_allreduce_nvls_packet`, `default_allreduce_packet`
- Large messages (≥ 512 KB): `default_allreduce_rsag_zero_copy`
2. **Grid search** — Each candidate is run with every combination of block counts (`4, 8, 16, … 128`) and thread counts (`512, 768, 1024`). Results are captured in a CUDA graph and timed.
3. **Cross-rank consensus** — Elapsed times are averaged across all ranks with an allreduce so every GPU selects the same configuration.
4. **Runtime dispatch** — `get_tuned_config()` rounds the actual message size up to the next power of two and returns the winning `(algorithm, nblocks, nthreads)` triple.
### Symmetric Memory Allocation
Algorithms like `default_allreduce_nvls_zero_copy` require **symmetric memory** — memory where the buffer offset is the same for each rank, allocated via `mscclpp.RawGpuBuffer` (`cuMemAlloc`). Regular `torch.rand()` or `torch.empty()` allocations cannot be used with these algorithms because they do not guarantee the same offset across ranks. Instead, allocate a single large buffer and reuse it for all message sizes:
```python
# Allocate symmetric memory via RawGpuBuffer and wrap as a PyTorch tensor
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
tune_tensor.normal_()
```
When executing an algorithm with symmetric memory, pass `symmetric_memory=True`:
```python
def _run_algo(self, algo, tensor, size, nblocks, nthreads):
return algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
symmetric_memory=True,
)
```
### Loading Candidate Algorithms
The same `load_algorithms` helper from Approach 1 is reused. The tuner extracts multiple algorithm objects:
```python
algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
self._algorithm_nvls_packet = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet"
][0]
self._algorithm_rsag_zero_copy = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy"
][0]
self._algorithm_packet = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_packet"
][0]
# NVLS zero-copy is only available on supported hardware
if mscclpp.is_nvls_supported():
self._algorithm_nvls_zero_copy = [
algo for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy"
][0]
```
### The Tuning Loop
The tuning loop iterates over message sizes, candidate algorithms, and kernel launch parameters. CUDA graphs are used for accurate timing. Note the use of `RawGpuBuffer` for symmetric memory:
```python
def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph):
sizes = [1 << i for i in range(10, 28)]
self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)}
# Use RawGpuBuffer for symmetric memory allocation
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
tune_tensor.normal_()
candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128]
candidates_nthreads = [512, 768, 1024]
for size in sizes:
algos = []
if mscclpp.is_nvls_supported():
algos.append(self._algorithm_nvls_zero_copy)
if size <= 4 * 1024 * 1024:
algos.append(self._algorithm_nvls_packet)
algos.append(self._algorithm_packet)
if size >= 512 * 1024:
algos.append(self._algorithm_rsag_zero_copy)
best_time = float("inf")
best_config = None
for algo in algos:
for nb in candidates_nblocks:
for nt in candidates_nthreads:
if self._run_algo(algo, tune_tensor, size, nb, nt) != 0:
continue # skip unsupported configs
# Warmup, then time with CUDA graphs
# ... (see full example for graph capture logic)
# Average timing across ranks
time_tensor = torch.full(
(self.world_size,), elapsed, dtype=torch.float64, device="cuda"
).to(dtype=torch.float32)
self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM)
avg_time = time_tensor[self.rank].item() / self.world_size
if avg_time < best_time:
best_time = avg_time
best_config = (algo, nb, nt)
if best_config:
self.best_configs[size] = best_config
```
### Dispatching with Tuned Configuration
At runtime, round the message size to the next power of two and look up the best configuration. When the tensor is allocated from `RawGpuBuffer` (`cuMemAlloc`) and the buffer offset is the same for each rank, pass `symmetric_memory=True` to the `execute()` call (see the [Symmetric Memory Allocation](#symmetric-memory-allocation) section above):
```python
def get_tuned_config(self, size):
if size < 1024:
target_size = 1024
elif size > 256 * 1024 * 1024:
target_size = 256 * 1024 * 1024
else:
target_size = 1 << (size - 1).bit_length()
return self.best_configs.get(target_size)
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None):
config = self.get_tuned_config(tensor.nbytes)
algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0)
algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
)
```
### Benchmarking with Symmetric Memory
When benchmarking tuned configurations, use the same `RawGpuBuffer` allocation pattern. Create one large buffer and slice it for each message size:
```python
def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
# Allocate a single large RawGpuBuffer (symmetric memory) and reuse for all sizes
dtype = torch.float16
bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype))
bench_buf = torch.utils.dlpack.from_dlpack(bench_buf)
bench_buf.normal_()
for size in sizes:
n_elements = size // bench_buf.element_size()
tensor = bench_buf[:n_elements]
# Capture CUDA graph, warmup, and time...
with torch.cuda.graph(g, stream=capture_stream):
for _ in range(n_iter_per_graph):
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
```
### Running the Tuning Example
```bash
MSCCLPP_MASTER_ADDR=<ip> MSCCLPP_MASTER_PORT=<port> \
torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
```

View File

@@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API.
:toctree: py_api
:recursive:
mscclpp.comm
mscclpp.utils
mscclpp.language
mscclpp

View File

@@ -31,6 +31,9 @@
```
If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
* (Optional, for benchmarks) MPI
* (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1
* GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
* Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation).
* Others
* For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
@@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
```bash
# For NVIDIA platforms
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
# For AMD platforms
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
```
@@ -171,7 +174,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,14 +191,12 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features.

View File

@@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes);
mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport);
```
Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials.
Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials.
**GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope.
@@ -251,6 +251,37 @@ columns 2
Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed.
(mc-cross-node)=
## Cross-node Execution
For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms.
We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL.
Run the program on two nodes with command line arguments:
```
./bidir_memory_channel [<ip_port> <rank> <gpu_id>]
```
For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally.
On Node 0 (Rank 0):
```bash
$ ./bidir_memory_channel 192.168.0.1:50000 0 0
```
On Node 1 (Rank 1):
```bash
$ ./bidir_memory_channel 192.168.0.1:50000 1 0
```
You should see output indicating successful data transfer.
```{tip}
If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`).
```
## Summary and Next Steps
In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`.

View File

@@ -101,15 +101,17 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
static_cast<ncclDataType_t>(dtype));
static_cast<ncclDataType_t>(dtype), symmetricMemory);
});
return allgatherAlgo;
}
@@ -191,7 +193,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, ncclDataType_t dtype) {
size_t outputSize, ncclDataType_t dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -69,14 +69,16 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize,
mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
[self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype,
bool symmetricMemory) {
return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory);
});
return allgatherAlgo;
}
@@ -159,7 +161,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
}
mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
size_t outputSize, mscclpp::DataType dtype) {
size_t outputSize, mscclpp::DataType dtype, bool) {
return {(void*)input, output, inputSize, outputSize, 0};
}
};

View File

@@ -15,7 +15,9 @@ import ipaddress
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
scratch_buffer=scratch_buffer.data_ptr(),
scratch_buffer_size=scratch_buffer.nbytes,
rank=rank,
)
@@ -59,7 +61,7 @@ class CustomizedComm:
self._algorithm_nvls_nonzero_copy = [
algo
for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_with_copy"
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_warp_pipeline"
][0]
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):

View File

@@ -0,0 +1,476 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
import os
import ipaddress
import netifaces as ni
import torch
import mscclpp
import mscclpp.ext
import mscclpp.utils as mscclpp_utils
# -- Helpers ------------------------------------------------------------------
def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
"""Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
# PyTorch's from_dlpack does not support certain float8 DLPack type codes.
# Work around by importing as uint8 and reinterpreting via .view().
_DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
if dtype in _DLPACK_UNSUPPORTED:
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
return torch.utils.dlpack.from_dlpack(dlpack)
def _load_algorithms(scratch: torch.Tensor, rank: int):
return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
scratch_buffer=scratch.data_ptr(),
scratch_buffer_size=scratch.nbytes,
rank=rank,
)
def _interfaces_for_ip(ip: str):
target = ipaddress.ip_address(ip)
for iface in ni.interfaces():
addrs = ni.ifaddresses(iface)
if ni.AF_INET in addrs:
for link in addrs[ni.AF_INET]:
if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
return iface
return None
def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
if op == torch.distributed.ReduceOp.SUM:
return mscclpp.ReduceOp.SUM
if op == torch.distributed.ReduceOp.MIN:
return mscclpp.ReduceOp.MIN
raise ValueError(f"unsupported op: {op}")
def _round_pow2(size: int) -> int:
"""Round up to next power-of-2, clamped to [1024, 256 MB]."""
size = max(size, 1024)
size = min(size, 256 << 20)
return 1 << (size - 1).bit_length()
# -- CustomizedComm -----------------------------------------------------------
class CustomizedComm:
"""Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
_TUNE_N_WARMUP = 5
_TUNE_N_GRAPH_LAUNCHES = 10
_TUNE_N_OPS_PER_GRAPH = 100
_CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
_CANDIDATE_NTHREADS = [512, 768, 1024]
_NBLOCKS_LIMIT = {
"default_allreduce_nvls_packet": 16,
"default_allreduce_packet": 56,
"default_allreduce_allpair_packet": 56,
"default_allreduce_fullmesh": 64,
"default_allgather_fullmesh2": 32,
}
def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
self.comm = comm
self.rank = comm.my_rank
self.world_size = comm.nranks
self.symmetric_memory = symmetric_memory
self._nvls = mscclpp.is_nvls_supported()
self._scratch = _make_tensor(1 << 27, torch.float16)
self._barrier_tensor = _make_tensor(4096, torch.float32)
algos = _load_algorithms(self._scratch, self.rank)
self._algos = {(a.collective, a.name): a for a in algos}
# {collective: {rounded_size: (algo, nblocks, nthreads)}}
self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
self._tune_buf = None
self._time_buf = None
def _algo(self, collective: str, name: str):
return self._algos.get((collective, name))
def _default_ar_config(self):
"""Fallback allreduce config for barrier / timing sync."""
pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and pkt:
return (pkt, 0, 0)
return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
# -- low-level execute --
def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=op,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
accum_dtype=accum_dtype,
)
if ret != 0:
print(f"Rank {self.rank}: {algo.name} failed ({ret})")
return ret
def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
if sym is None:
sym = self.symmetric_memory
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=inp.data_ptr(),
output_buffer=out.data_ptr(),
input_size=inp.nbytes,
output_size=out.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
op=mscclpp.ReduceOp.NOP,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
)
if ret != 0:
print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
return ret
def _barrier_internal(self):
a, nb, nt = self._default_ar_config()
self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
# -- lazy tuning --
def _ensure_tune_bufs(self):
if self._tune_buf is None:
self._tune_buf = _make_tensor(1 << 27, torch.float16)
self._tune_buf.normal_()
self._time_buf = _make_tensor(4096, torch.float32)
return self._tune_buf
def _ar_candidates(self, size: int):
out = []
if size <= 4 << 20:
a = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_packet")
if a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_allpair_packet")
if a:
out.append(a)
if size >= 512 << 10:
a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
if self._nvls and self.symmetric_memory and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
if a:
out.append(a)
if torch.version.hip is not None:
a = self._algo("allreduce", "default_allreduce_fullmesh")
if a:
out.append(a)
return out
def _ag_candidates(self):
a = self._algo("allgather", "default_allgather_fullmesh2")
return [a] if a else []
def _run_tune(self, collective, algo, buf, size, nb, nt):
"""Single tune invocation for either collective."""
if collective == "allreduce":
return algo.execute(
comm=self.comm.communicator,
input_buffer=buf.data_ptr(),
output_buffer=buf.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=True,
)
else:
total = size * self.world_size
out_ptr = buf.data_ptr()
return algo.execute(
comm=self.comm.communicator,
input_buffer=out_ptr + self.rank * size,
output_buffer=out_ptr,
input_size=size,
output_size=total,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.NOP,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=False,
)
def _tune_size(self, collective: str, target_size: int):
"""Auto-tune one (collective, target_size) pair and cache result."""
buf = self._ensure_tune_bufs()
cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
best_time, best_cfg = float("inf"), None
used = set()
run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
for algo in cands:
nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
for nb in self._CANDIDATE_NBLOCKS:
if nb > nb_limit:
continue
for nt in self._CANDIDATE_NTHREADS:
# Feasibility — sync result across ranks so all agree
ret = run(algo, nb, nt)
torch.cuda.synchronize()
self._time_buf[0] = float(ret)
self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
if self._time_buf[0].item() != 0:
continue
used.add(algo)
# Warmup
for _ in range(self._TUNE_N_WARMUP):
run(algo, nb, nt)
# CUDA-graph timed benchmark
cs = torch.cuda.Stream()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(self._TUNE_N_OPS_PER_GRAPH):
run(algo, nb, nt)
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
start.record(cs)
with torch.cuda.stream(cs):
for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
g.replay()
end.record(cs)
end.synchronize()
elapsed = start.elapsed_time(end)
# Cross-rank timing sync
self._time_buf.fill_(elapsed)
torch.cuda.current_stream().wait_stream(cs)
self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
avg = self._time_buf[self.rank].item() / self.world_size
if avg < best_time:
best_time, best_cfg = avg, (algo, nb, nt)
if best_cfg:
self._tune_cache[collective][target_size] = best_cfg
if self.rank == 0:
n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
print(
f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
flush=True,
)
else:
fb = (
self._default_ar_config()
if collective == "allreduce"
else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
)
self._tune_cache[collective][target_size] = fb
torch.cuda.synchronize()
self._barrier_internal()
for a in used:
a.reset()
# -- public API --
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
sz = _round_pow2(tensor.nbytes)
if sz not in self._tune_cache["allreduce"]:
self._tune_size("allreduce", sz)
a, nb, nt = self._tune_cache["allreduce"][sz]
self._exec_ar(
tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
)
def all_gather(self, output_tensor, input_tensor, stream=None):
sz = _round_pow2(input_tensor.nbytes)
if sz not in self._tune_cache["allgather"]:
self._tune_size("allgather", sz)
a, nb, nt = self._tune_cache["allgather"][sz]
self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
def barrier(self):
self._barrier_internal()
def destroy(self):
self._algos.clear()
self._tune_cache = {"allreduce": {}, "allgather": {}}
self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
# -- Benchmarks (standalone) --------------------------------------------------
def _bench_sizes(low=5 * 1024, high=80 << 20):
sizes, c = [], low
while c <= high:
sizes.append(c)
c *= 2
return sizes
def benchmark_allreduce(
comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
for size in sizes:
nelems = size // buf.element_size()
t = buf[: size // buf.element_size()]
comm.all_reduce(t, accum_dtype=accum_dtype)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_reduce(t, accum_dtype=accum_dtype)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
for prs in sizes:
total = prs * comm.world_size
if total > buf.nbytes:
break
nt = total // buf.element_size()
npr = prs // buf.element_size()
out = buf[:nt]
inp = out[comm.rank * npr : (comm.rank + 1) * npr]
comm.all_gather(out, inp)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_gather(out, inp)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
# -- Bootstrap & main ---------------------------------------------------------
def init_dist() -> mscclpp.CommGroup:
addr = os.environ.get("MSCCLPP_MASTER_ADDR")
if addr:
rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
port = os.environ["MSCCLPP_MASTER_PORT"]
iface = _interfaces_for_ip(addr)
if not iface:
raise ValueError(f"No interface for {addr}")
return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
import torch.distributed as dist
dist.init_process_group(backend="gloo")
return mscclpp.CommGroup(torch_group=dist.group.WORLD)
def main():
local = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local)
dtype_str = os.environ.get("DTYPE", "float16")
dtype = getattr(torch, dtype_str, torch.float16)
accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
accum_str = os.environ.get("ACCUM_DTYPE")
accum_dtype = accum_map.get(accum_str) if accum_str else None
comm_group = init_dist()
cc = CustomizedComm(comm_group)
print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
cc.barrier()
torch.cuda.synchronize()
benchmark_allgather(cc, dtype=dtype)
cc.barrier()
torch.cuda.synchronize()
cc.destroy()
print(f"rank {local} completed successfully.")
if __name__ == "__main__":
main()

View File

@@ -1,19 +1,20 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
import os
from typing import Any, Dict
import torch, torch.distributed as dist
import mscclpp
import mscclpp.ext
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.rank import Rank
from mscclpp.language.utils import AlgoSpec
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram:
gpu_size = spec.world_size
with CollectiveProgram.from_spec(spec) as program:
# Creating Channels
@@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
nranks_per_node=8,
@@ -94,10 +95,10 @@ def init_dist():
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local = int(os.environ["LOCAL_RANK"])
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
setup_plan(algorithm_collection_builder, rank, world)
algorithm_collection_builder.set_algorithm_selector(selector)
dist.init_process_group(backend="nccl", device_id=local)
dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local))
return rank, world, local

View File

@@ -9,7 +9,7 @@
#include <sstream>
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
}
}
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedWait();
@@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
}
}
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedSignal();
@@ -88,7 +88,7 @@ int main() {
mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1);
mscclpp::BaseMemoryChannel memChan0(sema0);
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle();
void *devHandle0;
void* devHandle0;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice));
@@ -98,14 +98,14 @@ int main() {
mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0);
mscclpp::BaseMemoryChannel memChan1(sema1);
mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle();
void *devHandle1;
void* devHandle1;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice));
log("GPU 0: Launching gpuKernel0 ...");
MSCCLPP_CUDATHROW(cudaSetDevice(0));
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter);
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
log("GPU 1: Launching gpuKernel1 ...");
@@ -115,7 +115,7 @@ int main() {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
MSCCLPP_CUDATHROW(cudaEventRecord(start));
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter);
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaEventRecord(end));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));

View File

@@ -14,7 +14,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) {
}
}
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedWait();
@@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in
}
}
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) {
__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) {
if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
for (int i = 0; i < iter; ++i) {
devHandle->relaxedSignal();
@@ -115,14 +115,14 @@ void worker(int gpuId) {
mscclpp::BaseMemoryChannel memChan(sema);
auto memChanHandle = memChan.deviceHandle();
void *devHandle;
void* devHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
log("GPU ", gpuId, ": Launching a GPU kernel ...");
if (gpuId == 0) {
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
} else {
@@ -130,7 +130,7 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
MSCCLPP_CUDATHROW(cudaEventRecord(start));
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter);
gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter);
MSCCLPP_CUDATHROW(cudaGetLastError());
MSCCLPP_CUDATHROW(cudaEventRecord(end));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));

View File

@@ -16,7 +16,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -47,7 +47,7 @@ int wait_process(int pid) {
__device__ mscclpp::DeviceSyncer devSyncer;
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->relaxedSignal();
@@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
}
}
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->relaxedSignal();
@@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si
devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x);
}
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank,
__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank,
uint32_t flag) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
@@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand
devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag);
}
void worker(int gpuId) {
void worker(int myRank, int gpuId, const std::string& ipPort) {
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
const int myRank = gpuId;
const int remoteRank = myRank == 0 ? 1 : 0;
const int nRanks = 2;
const int iter = 1000;
@@ -105,11 +104,11 @@ void worker(int gpuId) {
const size_t bufferBytes = 256 * 1024 * 1024;
const size_t pktBufferBytes = 256 * 1024 * 1024;
log("GPU ", gpuId, ": Preparing for tests ...");
log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
// Build a connection and a semaphore
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER);
bootstrap->initialize(ipPort);
mscclpp::Communicator comm(bootstrap);
auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get();
auto sema = comm.buildSemaphore(conn, remoteRank).get();
@@ -133,8 +132,8 @@ void worker(int gpuId) {
auto memChanHandle = memChan.deviceHandle();
auto memPktChanHandle = memPktChan.deviceHandle();
void *devHandle;
void *devPktHandle;
void* devHandle;
void* devPktHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle)));
MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice));
@@ -146,23 +145,23 @@ void worker(int gpuId) {
std::function<void(size_t)> kernels[3];
kernels[0] = [&](size_t copyBytes) {
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
copyBytes, myRank);
bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
kernels[1] = [&](size_t copyBytes) {
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle),
copyBytes, myRank);
bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
kernels[2] = [&](size_t copyBytes) {
static uint32_t flag = 1;
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle),
bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle),
copyBytes, myRank, flag++);
};
cudaEvent_t start, end;
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
}
@@ -189,13 +188,13 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
bootstrap->barrier();
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(start, stream));
}
MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream));
if (gpuId == 0) {
if (myRank == 0) {
MSCCLPP_CUDATHROW(cudaEventRecord(end, stream));
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
float elapsedTime;
@@ -204,8 +203,8 @@ void worker(int gpuId) {
MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
elapsedTimePerIter = elapsedTime / iter;
gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f;
log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
gbps, " GB/s");
log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter,
" ms/iter, BW ", gbps, " GB/s");
}
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec));
@@ -216,23 +215,47 @@ void worker(int gpuId) {
bootstrap->barrier();
}
int main() {
int pid0 = spawn_process([]() { worker(0); });
int pid1 = spawn_process([]() { worker(1); });
if (pid0 < 0 || pid1 < 0) {
log("Failed to spawn processes.");
int main(int argc, char** argv) {
if (argc == 1) {
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
if (pid0 < 0 || pid1 < 0) {
log("Failed to spawn processes.");
return -1;
}
int status0 = wait_process(pid0);
int status1 = wait_process(pid1);
if (status0 < 0 || status1 < 0) {
log("Failed to wait for processes.");
return -1;
}
if (status0 != 0 || status1 != 0) {
log("One of the processes failed.");
return -1;
}
log("Succeed!");
return 0;
} else if (argc == 4) {
std::string ipPort = argv[1];
int rank, gpuId;
try {
rank = std::stoi(argv[2]);
gpuId = std::stoi(argv[3]);
} catch (const std::exception&) {
log("Error: rank and gpu_id must be valid integers.");
return -1;
}
if (rank < 0 || rank > 2 || gpuId < 0) {
log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
return -1;
}
worker(rank, gpuId, ipPort);
log("Rank ", rank, ": Succeed!");
return 0;
} else {
std::cerr << "Usage:\n"
<< " " << argv[0] << " Run in intra-node mode\n"
<< " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
return -1;
}
int status0 = wait_process(pid0);
int status1 = wait_process(pid1);
if (status0 < 0 || status1 < 0) {
log("Failed to wait for processes.");
return -1;
}
if (status0 != 0 || status1 != 0) {
log("One of the processes failed.");
return -1;
}
log("Succeed!");
return 0;
}

View File

@@ -16,7 +16,7 @@
#define PORT_NUMBER "50505"
template <typename... Args>
void log(Args &&...args) {
void log(Args&&... args) {
std::stringstream ss;
(ss << ... << args);
ss << std::endl;
@@ -45,7 +45,7 @@ int wait_process(int pid) {
return -1;
}
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid == 0) {
devHandle->signal();
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
}
}
void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
const int myRank = rank;
const int remoteRank = myRank == 0 ? 1 : 0;
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
auto portChanHandle = portChan.deviceHandle();
void *devHandle;
void* devHandle;
MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));
@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
std::function<void(size_t)> kernels[1];
kernels[0] = [&](size_t copyBytes) {
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
myRank);
};
@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
bootstrap->barrier();
}
mscclpp::Transport parseTransport(const std::string &transportStr) {
mscclpp::Transport parseTransport(const std::string& transportStr) {
if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
if (transportStr == "IB0") return mscclpp::Transport::IB0;
if (transportStr == "IB1") return mscclpp::Transport::IB1;
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
throw std::runtime_error("Unknown transport: " + transportStr);
}
int main(int argc, char **argv) {
int main(int argc, char** argv) {
if (argc == 1) {
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });

View File

@@ -0,0 +1,15 @@
# Build the bidir_switch_channel example with nvcc, targeting the GPU
# architecture of the build machine (-arch=native) and linking mscclpp.
CUDA_HOME ?= /usr/local/cuda
COMPILER := $(CUDA_HOME)/bin/nvcc
ARCH_FLAG := -arch=native

TARGET = bidir_switch_channel
SRC = bidir_switch_channel.cu

all: $(TARGET)

# Recipe lines must begin with a hard tab, as required by make.
$(TARGET): $(SRC)
	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp

clean:
	rm -f $(TARGET)

# `all` and `clean` do not produce files with those names.
.PHONY: all clean

View File

@@ -0,0 +1,177 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <sys/wait.h>
#include <unistd.h>
#include <functional>
#include <iostream>
#include <mscclpp/concurrency_device.hpp>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mscclpp/switch_channel.hpp>
#include <mscclpp/switch_channel_device.hpp>
#include <sstream>
#define PORT_NUMBER "50505"
/// Format all arguments into one buffer and write it to stdout with a single
/// stream insertion, so concurrent processes produce unbroken lines.
/// A trailing newline is always appended.
template <typename... Args>
void log(Args&&... args) {
  std::stringstream buffer;
  ((buffer << std::forward<Args>(args)), ...);
  buffer << std::endl;
  std::cout << buffer.str();
}
/// Fork a child process that executes `func` and then exits with status 0.
/// @param func Work to run in the child process.
/// @return The child's pid on success, or -1 if fork() failed.
int spawn_process(std::function<void()> func) {
  const pid_t child = fork();
  if (child == 0) {
    // Child: run the payload, then terminate immediately so the child never
    // returns into the parent's control flow.
    func();
    exit(0);
  }
  // Parent: fork() returned the child's pid, or -1 on failure.
  return (child < 0) ? -1 : child;
}
/// Block until the given child process terminates.
/// @param pid Process id, e.g. as returned by spawn_process().
/// @return The child's exit code if it exited normally; -1 if waitpid()
///         failed or the child terminated abnormally (e.g. by a signal).
int wait_process(int pid) {
  int status = 0;
  const bool waited = (waitpid(pid, &status, 0) >= 0);
  if (waited && WIFEXITED(status)) {
    return WEXITSTATUS(status);
  }
  return -1;
}
// Switch-channel handle shared by all kernel launches; the host populates it
// via cudaMemcpyToSymbol before launching kernelSwitchReduce.
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
// Device-wide syncer. NOTE(review): not referenced by kernelSwitchReduce in
// this file — confirm whether it is needed or can be removed.
__device__ mscclpp::DeviceSyncer devSyncer;
// Reduce-then-broadcast over the switch channel. Each rank covers one half of
// the element range (rank 0 the first half, rank 1 the second), with threads
// striding by the total number of launched threads.
// @param rank        This rank's id (0 or 1).
// @param numElements Total number of f32 elements; each rank processes half.
__global__ void kernelSwitchReduce(int rank, int numElements) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  // rank 0 performs on first half of data and rank 1 on second half
  int min = rank * (numElements / 2);
  int max = (rank + 1) * (numElements / 2);
  for (int i = tid + min; i < max; i += stride) {
    // Load-reduce the f32 value at offset i across ranks, then write the
    // reduced value back through the channel's broadcast operation.
    auto val = gConstSwitchChan.reduce<mscclpp::f32x1>(i);
    gConstSwitchChan.broadcast(i, val);
  }
}
// Per-process worker: binds to a GPU, bootstraps a 2-rank communicator over
// TCP, binds an NVLS switch channel to a shared GPU buffer, and times
// kernelSwitchReduce for several message sizes. Rank 0 measures and prints
// the achieved bandwidth.
// @param myRank This process's rank (0 or 1).
// @param gpuId  CUDA device ordinal to use.
// @param ipPort Bootstrap address, e.g. "lo:127.0.0.1:50505".
void worker(int myRank, int gpuId, const std::string &ipPort) {
  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
  const int nRanks = 2;
  const int iter = 1000;
  const size_t bufferBytes = 128 * 1024 * 1024;
  log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
  // Bootstrap a 2-rank communicator over TCP.
  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
  bootstrap->initialize(ipPort);
  std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
  std::vector<int> ranks;
  ranks.reserve(nRanks);
  for (int i = 0; i < nRanks; i++) ranks.push_back(i);
  // NOTE(review): GpuBuffer<float> is constructed with bufferBytes; if the
  // constructor argument counts elements rather than bytes, this allocates
  // 4x the intended size — confirm against the GpuBuffer API.
  auto buffer = mscclpp::GpuBuffer<float>(bufferBytes);
  // Create the NVLS collective connection across both ranks and bind the
  // allocated buffer to obtain a switch channel.
  auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
  auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
  auto deviceHandle = switchChannel.deviceHandle();
  // Publish the channel handle to the __constant__ symbol read by the kernel.
  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  // Call the kernel in a loop for perf evaluation at increasing sizes.
  for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
    // Only rank 0 keeps timing events; both ranks synchronize via barriers so
    // the measured window covers both ranks' work.
    // NOTE(review): the events are never destroyed (no cudaEventDestroy),
    // leaking two events per size iteration on rank 0.
    cudaEvent_t start, end;
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventCreate(&start));
      MSCCLPP_CUDATHROW(cudaEventCreate(&end));
    }
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
    bootstrap->barrier();
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
    }
    for (int i = 0; i < iter; ++i) {
      kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
    }
    MSCCLPP_CUDATHROW(cudaGetLastError());
    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
    comm->bootstrap()->barrier();
    if (myRank == 0) {
      MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
      MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
      float elapsedTime;
      float elapsedTimePerIter;
      float gbps;
      MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
      elapsedTimePerIter = elapsedTime / iter;
      // Bytes moved per iteration (4 bytes per f32 element).
      float dataSize = numElements * 4;
      // bytes/ms * 1e-6 == GB/s.
      gbps = dataSize / elapsedTimePerIter * 1e-6f;
      log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
          gbps, " GB/s");
    }
  }
}
/// Entry point. With no arguments, runs in intra-node mode: forks two local
/// workers (ranks 0 and 1) bootstrapping over loopback. With three arguments
/// (<ip_port> <rank> <gpu_id>), runs a single worker in inter-node mode.
/// @return 0 on success, -1 on any failure.
int main(int argc, char **argv) {
  if (argc == 1) {
    // Intra-node mode: two child processes on GPUs 0 and 1.
    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
    if (pid0 < 0 || pid1 < 0) {
      log("Failed to spawn processes.");
      return -1;
    }
    int status0 = wait_process(pid0);
    int status1 = wait_process(pid1);
    if (status0 < 0 || status1 < 0) {
      log("Failed to wait for processes.");
      return -1;
    }
    if (status0 != 0 || status1 != 0) {
      log("One of the processes failed.");
      return -1;
    }
    log("Succeed!");
    return 0;
  } else if (argc == 4) {
    // Inter-node mode: parse and validate the command-line arguments.
    std::string ipPort = argv[1];
    int rank, gpuId;
    try {
      rank = std::stoi(argv[2]);
      gpuId = std::stoi(argv[3]);
    } catch (const std::exception &) {
      log("Error: rank and gpu_id must be valid integers.");
      return -1;
    }
    // Only ranks 0 and 1 participate (worker hard-codes nRanks = 2). The
    // previous check (`rank > 2`) wrongly accepted rank == 2, contradicting
    // the error message below.
    if (rank < 0 || rank > 1 || gpuId < 0) {
      log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
      return -1;
    }
    worker(rank, gpuId, ipPort);
    log("Rank ", rank, ": Succeed!");
    return 0;
  } else {
    std::cerr << "Usage:\n"
              << " " << argv[0] << " Run in intra-node mode\n"
              << " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
    return -1;
  }
}

View File

@@ -84,6 +84,11 @@ class Algorithm {
/// @return The Constraint struct specifying worldSize and nRanksPerNode requirements.
virtual Constraint constraint() const = 0;
/// Set the valid message size range for this algorithm.
/// @param minMessageSize Minimum supported message size in bytes.
/// @param maxMessageSize Maximum supported message size in bytes.
virtual void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) = 0;
/// Execute the algorithm.
/// @param comm The communicator to use.
/// @param input Pointer to the input buffer.
@@ -96,12 +101,16 @@ class Algorithm {
/// @param executor The executor for DSL algorithms (may be nullptr for native).
/// @param nBlocks Number of CUDA blocks (0 for auto-selection).
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param symmetricMemory Whether to use symmetric memory optimization.
/// @param extras Additional parameters for algorithm-specific customization.
/// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) = 0;
/// Reset the algorithm state, clearing any cached contexts.
virtual void reset() = 0;
@@ -179,10 +188,11 @@ class NativeAlgorithm : public Algorithm {
/// @param nBlocks Number of CUDA blocks.
/// @param nThreadsPerBlock Number of threads per block.
/// @param extras Additional algorithm-specific parameters.
/// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel).
/// @return The result of the operation.
using KernelFunc =
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&, DataType)>;
/// Function type for creating algorithm contexts.
/// @param comm The communicator.
@@ -201,9 +211,10 @@ class NativeAlgorithm : public Algorithm {
/// @param inputSize Size of the input buffer.
/// @param outputSize Size of the output buffer.
/// @param dtype Data type of the elements.
/// @param symmetricMemory Whether symmetric memory is enabled.
/// @return A key uniquely identifying this buffer configuration.
using ContextKeyGenFunc = std::function<AlgorithmCtxKey(const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype)>;
size_t outputSize, DataType dtype, bool symmetricMemory)>;
/// Construct a NativeAlgorithm.
/// @param name Human-readable name of the algorithm.
@@ -225,10 +236,12 @@ class NativeAlgorithm : public Algorithm {
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
AlgorithmType type() const override { return AlgorithmType::Native; }
@@ -269,12 +282,14 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override;
const std::unordered_map<std::string, uint64_t>& tags() const override;
const CollectiveBufferMode& bufferMode() const override;
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
AlgorithmType type() const override { return AlgorithmType::DSL; }
Constraint constraint() const override;
void reset() override;
@@ -299,6 +314,7 @@ struct CollectiveRequest {
const void* inputBuffer;
void* outputBuffer;
size_t messageSize;
cudaStream_t stream;
const std::string& collective;
const DataType dtype;
const std::unordered_map<std::string, std::vector<uint64_t>>& hints;
@@ -358,6 +374,10 @@ class AlgorithmCollection {
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
};
/// Get a default GPU flag buffer (allocated once and reused).
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer();
} // namespace mscclpp
#endif // MSCCLPP_ALGORITHM_HPP_

View File

@@ -19,11 +19,11 @@
#else // defined(DEBUG_BUILD)
#if defined(MSCCLPP_DEVICE_HIP)
extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function);
extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
const char* __function);
#else // !defined(MSCCLPP_DEVICE_HIP)
extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line,
const char *__function) __THROW;
extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line,
const char* __function) __THROW;
#endif // !defined(MSCCLPP_DEVICE_HIP)
/// Assert a condition on the device and print a message if the condition is false.

View File

@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
}
#elif defined(MSCCLPP_DEVICE_HIP)
#else // !defined(MSCCLPP_DEVICE_CUDA)
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
// HIP does not have thread scope enums like CUDA
constexpr auto scopeSystem = 0;
constexpr auto scopeDevice = 0;
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
return __atomic_fetch_add(ptr, val, memoryOrder);
}
#endif // defined(MSCCLPP_DEVICE_HIP)
#endif // !defined(MSCCLPP_DEVICE_CUDA)
} // namespace mscclpp

View File

@@ -381,11 +381,19 @@ struct EndpointConfig {
/// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other
/// transports.
struct Ib {
/// IB mode for signaling, used to select between different implementations.
enum class Mode {
Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset).
Host, // Use the host stack with RDMA atomics.
HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics).
};
static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
static constexpr int DefaultMaxRecvWr = 16;
static constexpr int DefaultMaxWrPerSend = 64;
/// Device index. Currently ignored; use transport type (IB0-IB7) to select device.
@@ -394,32 +402,41 @@ struct EndpointConfig {
int port;
/// GID index.
int gidIndex;
/// Maximum size of the completion queue.
/// Maximum size of the send completion queue.
int maxCqSize;
/// Maximum number of completion queue polls per operation.
/// Maximum number of send completion queue polls per operation.
int maxCqPollNum;
/// Maximum number of outstanding send work requests.
int maxSendWr;
/// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate).
int maxRecvWr;
/// Maximum number of work requests per send operation.
int maxWrPerSend;
/// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable.
Mode mode;
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
/// @param gidIndex GID index.
/// @param maxCqSize Maximum completion queue size.
/// @param maxCqPollNum Maximum completion queue poll count.
/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.
/// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode).
/// @param maxWrPerSend Maximum work requests per send operation.
/// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable).
Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize,
int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend)
int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr,
int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default)
: deviceIndex(deviceIndex),
port(port),
gidIndex(gidIndex),
maxCqSize(maxCqSize),
maxCqPollNum(maxCqPollNum),
maxSendWr(maxSendWr),
maxWrPerSend(maxWrPerSend) {}
maxRecvWr(maxRecvWr),
maxWrPerSend(maxWrPerSend),
mode(mode) {}
};
/// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet).
@@ -658,6 +675,7 @@ class Connection {
friend class SemaphoreStub;
friend class Semaphore;
friend class ProxyService;
friend class BaseConnection;
};
/// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user.

View File

@@ -54,6 +54,12 @@ class Env {
/// default libibverbs library found in the system.
const std::string ibvSo;
/// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel.
/// Allowed values:
/// - "host": use the host stack with RDMA atomics (default).
/// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics).
const std::string ibvMode;
/// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname.
/// This is used to determine whether the host is the same across different processes.
const std::string hostid;
@@ -70,9 +76,9 @@ class Env {
/// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this.
const std::string commId;
/// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to
/// use execution plans for the NCCL API. Unset by default.
const std::string executionPlanDir;
/// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files.
/// If unset, it defaults to `~/.cache/mscclpp`.
const std::string cacheDir;
/// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be
/// enabled and will dump traces to this directory. Unset by default.
@@ -92,17 +98,27 @@ class Env {
/// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`.
const std::string forceNcclFallbackOperation;
/// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs.
/// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with
/// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling
/// the channel cache won't affect the performance, but otherwise it may lead to performance degradation.
/// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory
/// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the
/// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections
/// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs
/// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator.
/// Default is false.
const bool disableChannelCache;
const bool ncclSymmetricMemory;
/// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++.
/// Default is false.
const bool forceDisableNvls;
/// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
/// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
/// Default is false.
const bool forceDisableGdr;
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified).
const int ibGidIndex;
private:
Env();

View File

@@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder {
/// @return The built AlgorithmCollection containing all registered algorithms.
AlgorithmCollection build();
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank);
AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer,
size_t flagBufferSize, int rank);
private:
AlgorithmCollectionBuilder() = default;
@@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder {
AlgoSelectFunc algoSelector_ = nullptr;
AlgoSelectFunc fallbackAlgoSelector_ = nullptr;
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize);
AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize,
uintptr_t flagBuffer, size_t flagBufferSize);
AlgorithmCollection buildDefaultDslAlgorithms(int rank);
static std::shared_ptr<AlgorithmCollectionBuilder> gAlgorithmCollectionBuilder_;

View File

@@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t;
using cudaDeviceProp = hipDeviceProp_t;
using cudaStream_t = hipStream_t;
using cudaStreamCaptureMode = hipStreamCaptureMode;
using cudaStreamCaptureStatus = hipStreamCaptureStatus;
using cudaMemcpyKind = hipMemcpyKind;
using cudaIpcMemHandle_t = hipIpcMemHandle_t;
@@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported;
constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking;
constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal;
constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed;
constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone;
constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive;
constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated;
constexpr auto cudaHostAllocMapped = hipHostMallocMapped;
constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined;
constexpr auto cudaMemcpyDefault = hipMemcpyDefault;
@@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
#define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__)
#define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__)
#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__)
#define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__)
#define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__)
#define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__)

File diff suppressed because it is too large Load Diff

View File

@@ -29,7 +29,9 @@ class Proxy {
public:
/// Constructor.
/// @param handler Handler for each FIFO trigger.
/// @param threadInit Optional function run in proxy thread before FIFO consumption.
/// @param threadInit Optional function run once in the proxy thread before FIFO consumption.
/// The function should initialize thread runtime context before any CUDA API call in that thread
/// (for example, set CUDA device and optionally bind NUMA affinity).
/// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize = DEFAULT_FIFO_SIZE);

View File

@@ -16,6 +16,7 @@ namespace mscclpp {
class Host2DeviceSemaphore {
private:
Semaphore semaphore_;
std::shared_ptr<uint64_t> inboundToken_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
std::unique_ptr<uint64_t> outboundToken_;
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
/// @param connection The connection associated with this semaphore.
Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
/// Destructor.
~Host2DeviceSemaphore();
/// Move constructor.
Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
/// Move assignment operator.
Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
/// Returns the connection.
/// @return The connection associated with this semaphore.
Connection& connection();
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
private:
Semaphore semaphore_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
detail::UniqueGpuPtr<uint64_t> outboundToken_;
public:
/// Constructor.

View File

@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
/// Signal remote device, ensures prior memory ops complete.
MSCCLPP_DEVICE_INLINE void signal() {
auto outbound = incOutbound();
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
// Using memoryOrderSeqCst is faster for A100.
atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
#else
atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
#endif
}
/// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
MSCCLPP_DEVICE_INLINE void relaxedSignal() {
auto outbound = incOutbound();
atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
#endif
}
/// Thread-safe read of expected inbound value.
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
return atomicLoad<uint64_t, scopeSystem>(inboundToken, memoryOrderRelaxed);
}
/// Thread-safe read of outbound value.
/// @return The outbound value.
MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
return atomicLoad<uint64_t, scopeDevice>(outboundToken, memoryOrderRelaxed);
}
/// Thread-safe increment of outbound value.
/// @return The incremented outbound value.
MSCCLPP_DEVICE_INLINE uint64_t incOutbound() {
return atomicFetchAdd<uint64_t, scopeDevice>(outboundToken, 1, memoryOrderRelaxed) + 1;
}
#endif // defined(MSCCLPP_DEVICE_COMPILE)
/// A local memory space where the remote device will write its semaphore value and the local device will read it.
uint64_t* inboundToken;
/// A local memory space where the local device stores the semaphore value to be written to the remote device.
uint64_t* outboundToken;
/// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the
/// remote device.
/// A remote memory space where the local device atomically increments. This is inboundToken of the remote device.
uint64_t* remoteInboundToken;
/// A local memory space where the local device stores the expected value of the inboundToken to wait for.

View File

@@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle {
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];"
: "=r"(val.words[0]), "=r"(val.words[1])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];"
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];"
: "=r"(val.words[0]), "=r"(val.words[1])
: "l"(ptr)
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];"
: "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3])
: "l"(ptr)
@@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle {
asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x4>) {
asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x8>) {
asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e4m3x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e4m3x16>) {
asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x4>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x4>) {
asm volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x8>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x8>) {
asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1])
: "memory");
} else if constexpr (std::is_same_v<VectorType, fp8_e5m2x16>) {
} else if constexpr (std::is_same_v<VectorType, f8_e5m2x16>) {
asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]),
"r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3])
: "memory");

View File

@@ -4,6 +4,10 @@
add_subdirectory(csrc)
add_subdirectory(test)
target_compile_definitions(mscclpp_py PRIVATE
$<$<BOOL:${MSCCLPP_DISABLE_NB_LEAK_WARNINGS}>:MSCCLPP_DISABLE_NB_LEAK_WARNINGS>
)
add_custom_target(pytest_lib_copy ALL
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
@@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
DEPENDS mscclpp_py mscclpp_py_test
)
)

View File

@@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
if(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM)
endif()
install(TARGETS mscclpp_py LIBRARY DESTINATION .)

View File

@@ -16,14 +16,16 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_algorithm(nb::module_& m) {
nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
nb::enum_<CollectiveBufferMode>(m, "CppCollectiveBufferMode")
.value("ANY", CollectiveBufferMode::Any)
.value("IN_PLACE", CollectiveBufferMode::InPlace)
.value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);
nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);
nb::enum_<AlgorithmType>(m, "CppAlgorithmType")
.value("NATIVE", AlgorithmType::Native)
.value("DSL", AlgorithmType::DSL);
nb::enum_<CommResult>(m, "CommResult")
nb::enum_<CommResult>(m, "CppCommResult")
.value("COMM_SUCCESS", CommResult::CommSuccess)
.value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
.value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
@@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) {
.value("COMM_IN_PROGRESS", CommResult::CommInProgress)
.value("COMM_NUM_RESULTS", CommResult::CommNumResults);
nb::enum_<ReduceOp>(m, "ReduceOp")
nb::enum_<ReduceOp>(m, "CppReduceOp")
.value("SUM", ReduceOp::SUM)
.value("MIN", ReduceOp::MIN)
.value("NOP", ReduceOp::NOP);
auto algorithmClass =
nb::class_<Algorithm>(m, "Algorithm")
nb::class_<Algorithm>(m, "CppAlgorithm")
.def_static(
"from_native_capsule",
[](nb::capsule cap) {
@@ -58,6 +60,12 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("name", &Algorithm::name)
.def_prop_ro("collective", &Algorithm::collective)
.def_prop_ro("message_range", &Algorithm::messageRange)
.def(
"set_message_size_range",
[](Algorithm& self, size_t minMessageSize, size_t maxMessageSize) {
self.setMessageSizeRange(minMessageSize, maxMessageSize);
},
nb::arg("min_message_size"), nb::arg("max_message_size"))
.def_prop_ro("tags", &Algorithm::tags)
.def_prop_ro("buffer_mode", &Algorithm::bufferMode)
.def_prop_ro("constraint", &Algorithm::constraint)
@@ -67,16 +75,19 @@ void register_algorithm(nb::module_& m) {
"execute",
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
std::unordered_map<std::string, uintptr_t> extras) {
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
nBlocks, nThreadsPerBlock, extras);
nBlocks, nThreadsPerBlock, symmetricMemory, extras,
static_cast<DataType>(accumDtype));
},
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
.def("reset", &Algorithm::reset);
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
.def(nb::init<>())
@@ -84,21 +95,21 @@ void register_algorithm(nb::module_& m) {
.def_rw("world_size", &Algorithm::Constraint::worldSize)
.def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);
nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);
nb::class_<AlgorithmBuilder>(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build);
nb::class_<DslAlgorithm, Algorithm>(m, "DslAlgorithm")
nb::class_<DslAlgorithm, Algorithm>(m, "CppDslAlgorithm")
.def(nb::init<std::string, ExecutionPlan, std::unordered_map<std::string, uint64_t>, Algorithm::Constraint>(),
nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map<std::string, uint64_t>(),
nb::arg("constraint") = Algorithm::Constraint())
.def("build", &DslAlgorithm::build);
nb::class_<AlgorithmCollection>(m, "AlgorithmCollection")
nb::class_<AlgorithmCollection>(m, "CppAlgorithmCollection")
.def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"),
nb::arg("algorithm"))
.def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective"))
.def("to_list", &AlgorithmCollection::getAllAlgorithms);
nb::class_<CollectiveRequest>(m, "CollectiveRequest")
nb::class_<CollectiveRequest>(m, "CppCollectiveRequest")
.def_ro("world_size", &CollectiveRequest::worldSize)
.def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode)
.def_ro("rank", &CollectiveRequest::rank)
@@ -107,8 +118,22 @@ void register_algorithm(nb::module_& m) {
.def_prop_ro("output_buffer",
[](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.outputBuffer); })
.def_ro("message_size", &CollectiveRequest::messageSize)
.def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast<uintptr_t>(self.stream); })
.def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; })
.def_ro("dtype", &CollectiveRequest::dtype)
.def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; })
.def("buffer_mode", &CollectiveRequest::bufferMode);
m.def(
"cpp_get_flag_buffer",
[]() {
auto [buffer, size] = getFlagBuffer();
uintptr_t ptr = reinterpret_cast<uintptr_t>(buffer.get());
// Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime.
auto prevent = std::make_unique<std::shared_ptr<void>>(std::move(buffer));
nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast<std::shared_ptr<void>*>(p); });
prevent.release(); // capsule now owns the pointer
return nb::make_tuple(ptr, size, owner);
},
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size, owner).");
}

View File

@@ -32,21 +32,25 @@ extern void register_algorithm_collection_builder(nb::module_& m);
template <typename T>
void def_shared_future(nb::handle& m, const std::string& typestr) {
std::string pyclass_name = std::string("shared_future_") + typestr;
std::string pyclass_name = std::string("CppSharedFuture_") + typestr;
nb::class_<std::shared_future<T>>(m, pyclass_name.c_str()).def("get", &std::shared_future<T>::get);
}
void register_core(nb::module_& m) {
m.def("version", &version);
nb::enum_<DataType>(m, "DataType")
nb::enum_<DataType>(m, "CppDataType")
.value("int32", DataType::INT32)
.value("uint32", DataType::UINT32)
.value("float16", DataType::FLOAT16)
.value("float32", DataType::FLOAT32)
.value("bfloat16", DataType::BFLOAT16);
.value("bfloat16", DataType::BFLOAT16)
.value("float8_e4m3", DataType::FLOAT8_E4M3)
.value("float8_e5m2", DataType::FLOAT8_E5M2)
.value("uint8", DataType::UINT8)
.value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
nb::class_<Bootstrap>(m, "Bootstrap")
nb::class_<Bootstrap>(m, "CppBootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)
.def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
@@ -71,7 +75,7 @@ void register_core(nb::module_& m) {
.def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
nb::arg("peer"), nb::arg("tag"));
nb::class_<UniqueId>(m, "UniqueId")
nb::class_<UniqueId>(m, "CppUniqueId")
.def(nb::init<>())
.def("__setstate__",
[](UniqueId& self, nb::bytes b) {
@@ -81,7 +85,7 @@ void register_core(nb::module_& m) {
.def("__getstate__",
[](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
nb::class_<TcpBootstrap, Bootstrap>(m, "CppTcpBootstrap")
.def(nb::init<int, int>(), "Do not use this constructor. Use create instead.")
.def_static(
"create", [](int rank, int nRanks) { return std::make_shared<TcpBootstrap>(rank, nRanks); }, nb::arg("rank"),
@@ -93,7 +97,7 @@ void register_core(nb::module_& m) {
.def("initialize", static_cast<void (TcpBootstrap::*)(const std::string&, int64_t)>(&TcpBootstrap::initialize),
nb::call_guard<nb::gil_scoped_release>(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30);
nb::enum_<Transport>(m, "Transport")
nb::enum_<Transport>(m, "CppTransport")
.value("Unknown", Transport::Unknown)
.value("CudaIpc", Transport::CudaIpc)
.value("IB0", Transport::IB0)
@@ -106,7 +110,7 @@ void register_core(nb::module_& m) {
.value("IB7", Transport::IB7)
.value("NumTransports", Transport::NumTransports);
nb::class_<TransportFlags>(m, "TransportFlags")
nb::class_<TransportFlags>(m, "CppTransportFlags")
.def(nb::init<>())
.def(nb::init_implicit<Transport>(), nb::arg("transport"))
.def("has", &TransportFlags::has, nb::arg("transport"))
@@ -130,12 +134,12 @@ void register_core(nb::module_& m) {
.def(nb::self == nb::self)
.def(nb::self != nb::self);
nb::enum_<DeviceType>(m, "DeviceType")
nb::enum_<DeviceType>(m, "CppDeviceType")
.value("Unknown", DeviceType::Unknown)
.value("CPU", DeviceType::CPU)
.value("GPU", DeviceType::GPU);
nb::class_<Device>(m, "Device")
nb::class_<Device>(m, "CppDevice")
.def(nb::init<>())
.def(nb::init_implicit<DeviceType>(), nb::arg("type"))
.def(nb::init<DeviceType, int>(), nb::arg("type"), nb::arg("id") = -1)
@@ -147,24 +151,33 @@ void register_core(nb::module_& m) {
return ss.str();
});
nb::class_<EndpointConfig::Ib>(m, "EndpointConfigIb")
nb::enum_<EndpointConfig::Ib::Mode>(m, "CppIbMode")
.value("Default", EndpointConfig::Ib::Mode::Default)
.value("Host", EndpointConfig::Ib::Mode::Host)
.value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic);
nb::class_<EndpointConfig::Ib>(m, "CppEndpointConfigIb")
.def(nb::init<>())
.def(nb::init<int, int, int, int, int, int, int>(), nb::arg("device_index") = -1,
.def(nb::init<int, int, int, int, int, int, int, int, EndpointConfig::Ib::Mode>(), nb::arg("device_index") = -1,
nb::arg("port") = EndpointConfig::Ib::DefaultPort,
nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex,
nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize,
nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum,
nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr,
nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend)
nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr,
nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend,
nb::arg("mode") = EndpointConfig::Ib::Mode::Default)
.def_rw("device_index", &EndpointConfig::Ib::deviceIndex)
.def_rw("port", &EndpointConfig::Ib::port)
.def_rw("gid_index", &EndpointConfig::Ib::gidIndex)
.def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize)
.def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum)
.def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr)
.def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend);
.def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr)
.def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend)
.def_rw("mode", &EndpointConfig::Ib::mode);
nb::class_<RegisteredMemory>(m, "RegisteredMemory")
nb::class_<RegisteredMemory>(m, "CppRegisteredMemory")
.def(nb::init<>())
.def("data", [](RegisteredMemory& self) { return reinterpret_cast<uintptr_t>(self.data()); })
.def("size", &RegisteredMemory::size)
@@ -172,7 +185,7 @@ void register_core(nb::module_& m) {
.def("serialize", &RegisteredMemory::serialize)
.def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data"));
nb::class_<Endpoint>(m, "Endpoint")
nb::class_<Endpoint>(m, "CppEndpoint")
.def("config", &Endpoint::config)
.def("transport", &Endpoint::transport)
.def("device", &Endpoint::device)
@@ -180,7 +193,7 @@ void register_core(nb::module_& m) {
.def("serialize", &Endpoint::serialize)
.def_static("deserialize", &Endpoint::deserialize, nb::arg("data"));
nb::class_<Connection>(m, "Connection")
nb::class_<Connection>(m, "CppConnection")
.def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"),
nb::arg("size"))
.def(
@@ -197,7 +210,7 @@ void register_core(nb::module_& m) {
.def("local_device", &Connection::localDevice)
.def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize);
nb::class_<EndpointConfig>(m, "EndpointConfig")
nb::class_<EndpointConfig>(m, "CppEndpointConfig")
.def(nb::init<>())
.def(nb::init_implicit<Transport>(), nb::arg("transport"))
.def(nb::init<Transport, Device, int, EndpointConfig::Ib>(), nb::arg("transport"), nb::arg("device"),
@@ -223,12 +236,18 @@ void register_core(nb::module_& m) {
.def_prop_rw(
"ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; },
[](EndpointConfig& self, int v) { self.ib.maxSendWr = v; })
.def_prop_rw(
"ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; },
[](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; })
.def_prop_rw(
"ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; },
[](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; })
.def_prop_rw(
"ib_mode", [](EndpointConfig& self) { return self.ib.mode; },
[](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; })
.def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize);
nb::class_<Context>(m, "Context")
nb::class_<Context>(m, "CppContext")
.def_static("create", &Context::create)
.def(
"register_memory",
@@ -239,13 +258,13 @@ void register_core(nb::module_& m) {
.def("create_endpoint", &Context::createEndpoint, nb::arg("config"))
.def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint"));
nb::class_<SemaphoreStub>(m, "SemaphoreStub")
nb::class_<SemaphoreStub>(m, "CppSemaphoreStub")
.def(nb::init<const Connection&>(), nb::arg("connection"))
.def("memory", &SemaphoreStub::memory)
.def("serialize", &SemaphoreStub::serialize)
.def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data"));
nb::class_<Semaphore>(m, "Semaphore")
nb::class_<Semaphore>(m, "CppSemaphore")
.def(nb::init<>())
.def(nb::init<const SemaphoreStub&, const SemaphoreStub&>(), nb::arg("local_stub"), nb::arg("remote_stub"))
.def("connection", &Semaphore::connection)
@@ -256,7 +275,7 @@ void register_core(nb::module_& m) {
def_shared_future<Connection>(m, "Connection");
def_shared_future<Semaphore>(m, "Semaphore");
nb::class_<Communicator>(m, "Communicator")
nb::class_<Communicator>(m, "CppCommunicator")
.def(nb::init<std::shared_ptr<Bootstrap>, std::shared_ptr<Context>>(), nb::arg("bootstrap"),
nb::arg("context") = nullptr)
.def("bootstrap", &Communicator::bootstrap)
@@ -289,6 +308,9 @@ void register_core(nb::module_& m) {
}
NB_MODULE(_mscclpp, m) {
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
nb::set_leak_warnings(false);
#endif
register_env(m);
register_error(m);
register_port_channel(m);
@@ -306,4 +328,4 @@ NB_MODULE(_mscclpp, m) {
// ext
register_algorithm_collection_builder(m);
}
}

View File

@@ -11,7 +11,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_env(nb::module_& m) {
nb::class_<Env>(m, "Env")
nb::class_<Env>(m, "CppEnv")
.def_ro("debug", &Env::debug)
.def_ro("debug_subsys", &Env::debugSubsys)
.def_ro("debug_file", &Env::debugFile)
@@ -20,9 +20,11 @@ void register_env(nb::module_& m) {
.def_ro("socket_family", &Env::socketFamily)
.def_ro("socket_ifname", &Env::socketIfname)
.def_ro("comm_id", &Env::commId)
.def_ro("execution_plan_dir", &Env::executionPlanDir)
.def_ro("ibv_mode", &Env::ibvMode)
.def_ro("cache_dir", &Env::cacheDir)
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
.def_ro("ib_gid_index", &Env::ibGidIndex);
m.def("env", &env);
}

View File

@@ -11,18 +11,18 @@ using namespace mscclpp;
#define REGISTER_EXCEPTION_TRANSLATOR(name_) \
nb::register_exception_translator( \
[](const std::exception_ptr &p, void *payload) { \
[](const std::exception_ptr& p, void* payload) { \
try { \
std::rethrow_exception(p); \
} catch (const name_ &e) { \
PyErr_SetObject(reinterpret_cast<PyObject *>(payload), \
} catch (const name_& e) { \
PyErr_SetObject(reinterpret_cast<PyObject*>(payload), \
PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \
} \
}, \
m.attr(#name_).ptr());
void register_error(nb::module_ &m) {
nb::enum_<ErrorCode>(m, "ErrorCode")
void register_error(nb::module_& m) {
nb::enum_<ErrorCode>(m, "CppErrorCode")
.value("SystemError", ErrorCode::SystemError)
.value("InternalError", ErrorCode::InternalError)
.value("RemoteError", ErrorCode::RemoteError)

View File

@@ -15,16 +15,16 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_executor(nb::module_& m) {
nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::enum_<PacketType>(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);
nb::class_<ExecutionPlan>(m, "ExecutionPlan")
nb::class_<ExecutionPlan>(m, "CppExecutionPlan")
.def(nb::init<const std::string&, int>(), nb::arg("planPath"), nb::arg("rank"))
.def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); })
.def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); })
.def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); })
.def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); });
nb::class_<Executor>(m, "Executor")
nb::class_<Executor>(m, "CppExecutor")
.def(nb::init<std::shared_ptr<Communicator>>(), nb::arg("comm"))
.def(
"execute",

View File

@@ -4,6 +4,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>
@@ -15,7 +16,7 @@ using namespace mscclpp;
using namespace mscclpp::collective;
void register_algorithm_collection_builder(nb::module_& m) {
nb::class_<AlgorithmCollectionBuilder>(m, "AlgorithmCollectionBuilder")
nb::class_<AlgorithmCollectionBuilder>(m, "CppAlgorithmCollectionBuilder")
.def_static("get_instance", &AlgorithmCollectionBuilder::getInstance)
.def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder"))
.def(
@@ -29,6 +30,6 @@ void register_algorithm_collection_builder(nb::module_& m) {
nb::arg("selector"))
.def("build", &AlgorithmCollectionBuilder::build)
.def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"),
nb::arg("scratch_buffer_size"), nb::arg("rank"))
nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank"))
.def_static("reset", &AlgorithmCollectionBuilder::reset);
}

View File

@@ -9,7 +9,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_fifo(nb::module_& m) {
nb::class_<ProxyTrigger>(m, "ProxyTrigger")
nb::class_<ProxyTrigger>(m, "CppProxyTrigger")
.def_prop_rw(
"fst", [](const ProxyTrigger& self) { return self.fst; },
[](ProxyTrigger& self, uint64_t v) { self.fst = v; })
@@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) {
"snd", [](const ProxyTrigger& self) { return self.snd; },
[](ProxyTrigger& self, uint64_t v) { self.snd = v; });
nb::class_<FifoDeviceHandle>(m, "FifoDeviceHandle")
nb::class_<FifoDeviceHandle>(m, "CppFifoDeviceHandle")
.def_rw("triggers", &FifoDeviceHandle::triggers)
.def_rw("tail", &FifoDeviceHandle::tail)
.def_rw("head", &FifoDeviceHandle::head)
@@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<Fifo>(m, "Fifo")
nb::class_<Fifo>(m, "CppFifo")
.def(nb::init<int>(), nb::arg("size") = DEFAULT_FIFO_SIZE)
.def("poll", &Fifo::poll)
.def("pop", &Fifo::pop)

View File

@@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) {
return DLDataType{kDLBfloat, 16, 1};
} else if (type == "torch.float16") {
return DLDataType{kDLFloat, 16, 1};
} else if (type == "torch.float8_e4m3fn") {
return DLDataType{kDLFloat8_e4m3fn, 8, 1};
} else if (type == "torch.float8_e4m3fnuz") {
return DLDataType{kDLFloat8_e4m3fnuz, 8, 1};
} else if (type == "torch.float8_e5m2") {
return DLDataType{kDLFloat8_e5m2, 8, 1};
} else if (type == "torch.float8_e5m2fnuz") {
return DLDataType{kDLFloat8_e5m2fnuz, 8, 1};
} else if (type == "torch.uint8") {
return DLDataType{kDLUInt, 8, 1};
} else if (type == "fp8_e4m3b15") {
// No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes.
return DLDataType{kDLUInt, 8, 1};
} else {
throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage);
}
@@ -101,7 +114,7 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
void register_gpu_utils(nb::module_& m) {
m.def("is_nvls_supported", &isNvlsSupported);
nb::class_<GpuBuffer<char>>(m, "RawGpuBuffer")
nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
.def(nb::init<size_t>(), nb::arg("nelems"))
.def("nelems", &GpuBuffer<char>::nelems)
.def("bytes", &GpuBuffer<char>::bytes)

View File

@@ -11,20 +11,20 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_memory_channel(nb::module_& m) {
nb::class_<BaseMemoryChannel>(m, "BaseMemoryChannel")
nb::class_<BaseMemoryChannel>(m, "CppBaseMemoryChannel")
.def(nb::init<>())
.def(nb::init<std::shared_ptr<MemoryDevice2DeviceSemaphore>>(), nb::arg("semaphore"))
.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def("device_handle", &BaseMemoryChannel::deviceHandle);
nb::class_<BaseMemoryChannel::DeviceHandle>(m, "BaseMemoryChannelDeviceHandle")
nb::class_<BaseMemoryChannel::DeviceHandle>(m, "CppBaseMemoryChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_)
.def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<MemoryChannel>(m, "MemoryChannel")
nb::class_<MemoryChannel>(m, "CppMemoryChannel")
.def(nb::init<>())
.def(
"__init__",
@@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) {
nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0)
.def("device_handle", &MemoryChannel::deviceHandle);
nb::class_<MemoryChannel::DeviceHandle>(m, "MemoryChannelDeviceHandle")
nb::class_<MemoryChannel::DeviceHandle>(m, "CppMemoryChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_)
.def_rw("dst_", &MemoryChannel::DeviceHandle::dst_)

View File

@@ -8,8 +8,8 @@
namespace nb = nanobind;
void register_npkit(nb::module_ &m) {
nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions");
void register_npkit(nb::module_& m) {
nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions");
sub_m.def("init", &NpKit::Init);
sub_m.def("dump", &NpKit::Dump);
sub_m.def("shutdown", &NpKit::Shutdown);

View File

@@ -6,8 +6,8 @@ int getDeviceNumaNode(int cudaDev);
void numaBind(int node);
}; // namespace mscclpp
void register_numa(nb::module_ &m) {
nb::module_ sub_m = m.def_submodule("numa", "numa functions");
void register_numa(nb::module_& m) {
nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions");
sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode);
sub_m.def("numa_bind", &mscclpp::numaBind);
}

View File

@@ -11,11 +11,11 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_port_channel(nb::module_& m) {
nb::class_<BaseProxyService>(m, "BaseProxyService")
nb::class_<BaseProxyService>(m, "CppBaseProxyService")
.def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &BaseProxyService::stopProxy);
nb::class_<ProxyService, BaseProxyService>(m, "ProxyService")
nb::class_<ProxyService, BaseProxyService>(m, "CppProxyService")
.def(nb::init<int>(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE)
.def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false)
.def("stop_proxy", &ProxyService::stopProxy)
@@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) {
.def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id"))
.def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src"));
nb::class_<BasePortChannel>(m, "BasePortChannel")
nb::class_<BasePortChannel>(m, "CppBasePortChannel")
.def(nb::init<>())
.def(nb::init<SemaphoreId, std::shared_ptr<Host2DeviceSemaphore>, std::shared_ptr<Proxy>>(),
nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"))
.def("device_handle", &BasePortChannel::deviceHandle);
nb::class_<BasePortChannel::DeviceHandle>(m, "BasePortChannelDeviceHandle")
nb::class_<BasePortChannel::DeviceHandle>(m, "CppBasePortChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_)
.def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_)
@@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<PortChannel>(m, "PortChannel")
nb::class_<PortChannel>(m, "CppPortChannel")
.def(nb::init<>())
.def(nb::init<SemaphoreId, std::shared_ptr<Host2DeviceSemaphore>, std::shared_ptr<Proxy>, MemoryId, MemoryId>(),
nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src"))
.def("device_handle", &PortChannel::deviceHandle);
nb::class_<PortChannel::DeviceHandle>(m, "PortChannelDeviceHandle")
nb::class_<PortChannel::DeviceHandle>(m, "CppPortChannelDeviceHandle")
.def(nb::init<>())
.def_rw("semaphore_id_", &PortChannel::DeviceHandle::semaphoreId_)
.def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_)

View File

@@ -10,7 +10,7 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_semaphore(nb::module_& m) {
nb::class_<Host2DeviceSemaphore> host2DeviceSemaphore(m, "Host2DeviceSemaphore");
nb::class_<Host2DeviceSemaphore> host2DeviceSemaphore(m, "CppHost2DeviceSemaphore");
host2DeviceSemaphore.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &Host2DeviceSemaphore::connection)
@@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<Host2HostSemaphore>(m, "Host2HostSemaphore")
nb::class_<Host2HostSemaphore>(m, "CppHost2HostSemaphore")
.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &Host2HostSemaphore::connection)
@@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) {
.def("wait", &Host2HostSemaphore::wait, nb::call_guard<nb::gil_scoped_release>(),
nb::arg("max_spin_count") = 10000000);
nb::class_<MemoryDevice2DeviceSemaphore> memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore");
nb::class_<MemoryDevice2DeviceSemaphore> memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore");
memoryDevice2DeviceSemaphore.def(nb::init<const Semaphore&>(), nb::arg("semaphore"))
.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"))
.def("connection", &MemoryDevice2DeviceSemaphore::connection)
@@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) {
nb::class_<MemoryDevice2DeviceSemaphore::DeviceHandle>(memoryDevice2DeviceSemaphore, "DeviceHandle")
.def(nb::init<>())
.def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken)
.def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken)
.def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken)
.def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken)
.def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes {

View File

@@ -15,11 +15,11 @@ namespace nb = nanobind;
using namespace mscclpp;
void register_nvls(nb::module_& m) {
nb::class_<SwitchChannel>(m, "SwitchChannel")
nb::class_<SwitchChannel>(m, "CppSwitchChannel")
.def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); })
.def("device_handle", &SwitchChannel::deviceHandle);
nb::class_<SwitchChannel::DeviceHandle>(m, "DeviceHandle")
nb::class_<SwitchChannel::DeviceHandle>(m, "CppSwitchChannelDeviceHandle")
.def(nb::init<>())
.def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr)
.def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr)
@@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) {
return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(self));
});
nb::class_<NvlsConnection>(m, "NvlsConnection")
nb::class_<NvlsConnection>(m, "CppNvlsConnection")
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size"));
m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"),

View File

@@ -23,35 +23,37 @@ version = {
from ._core import *
from ._mscclpp import (
Device,
DeviceType,
Communicator,
Connection,
CppDevice as Device,
CppDeviceType as DeviceType,
CppCommunicator as Communicator,
CppConnection as Connection,
connect_nvls_collective,
EndpointConfig,
Fifo,
Semaphore,
Host2DeviceSemaphore,
Host2HostSemaphore,
numa,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
MemoryDevice2DeviceSemaphore,
TcpBootstrap,
Transport,
TransportFlags,
DataType,
ErrorCode,
Executor,
ExecutionPlan,
PacketType,
RawGpuBuffer,
ReduceOp,
CppEndpointConfig as EndpointConfig,
CppEndpointConfigIb as EndpointConfigIb,
CppIbMode as IbMode,
CppFifo as Fifo,
CppSemaphore as Semaphore,
CppHost2DeviceSemaphore as Host2DeviceSemaphore,
CppHost2HostSemaphore as Host2HostSemaphore,
cpp_numa as numa,
CppProxyService as ProxyService,
CppRegisteredMemory as RegisteredMemory,
CppPortChannel as PortChannel,
CppMemoryChannel as MemoryChannel,
CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore,
CppTcpBootstrap as TcpBootstrap,
CppTransport as Transport,
CppTransportFlags as TransportFlags,
CppDataType as DataType,
CppErrorCode as ErrorCode,
CppExecutor as Executor,
CppExecutionPlan as ExecutionPlan,
CppPacketType as PacketType,
CppRawGpuBuffer as RawGpuBuffer,
CppReduceOp as ReduceOp,
env,
is_nvls_supported,
npkit,
cpp_npkit as npkit,
)
__all__ = [
@@ -61,6 +63,8 @@ __all__ = [
"Connection",
"connect_nvls_collective",
"EndpointConfig",
"EndpointConfigIb",
"IbMode",
"ErrorCode",
"Fifo",
"Semaphore",

View File

@@ -6,7 +6,7 @@ import shutil
import argparse
from pathlib import Path
from mscclpp.language import default_algos as def_algo
from mscclpp import default_algos as def_algo
from mscclpp.language.collectives import *
from mscclpp.language.utils import AlgoSpec
@@ -57,7 +57,7 @@ default_algo_configs = [
def create_default_plans():
plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default")
plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default")
plan_path = Path(plan_dir)
if plan_path.exists():
shutil.rmtree(plan_path)

View File

@@ -5,9 +5,3 @@ from .algorithm import *
from .comm import *
from .compiler import *
from .buffer import *
__all__ = []
__all__ += algorithm.__all__
__all__ += comm.__all__
__all__ += compiler.__all__
__all__ += buffer.__all__

View File

@@ -4,18 +4,22 @@
from __future__ import annotations
from typing import Optional, Tuple, Dict
from functools import cached_property
import cupy as cp
from mscclpp._mscclpp import (
Algorithm as _Algorithm,
DslAlgorithm as _DslAlgorithm,
AlgorithmType as _AlgorithmType,
Communicator,
CollectiveBufferMode,
DataType,
Executor,
ExecutionPlan,
ReduceOp,
CppAlgorithm,
CppDslAlgorithm,
CppAlgorithmType,
CppCommunicator,
CppCollectiveBufferMode,
CppDataType,
CppExecutor,
CppExecutionPlan,
CppReduceOp,
CppAlgorithmBuilder,
CppAlgorithmCollection,
cpp_get_flag_buffer,
)
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
@@ -45,7 +49,7 @@ class Algorithm:
"""
def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0):
self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node)
self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node)
@property
def world_size(self) -> int:
@@ -58,23 +62,23 @@ class Algorithm:
def __init__(
self,
id: Optional[str] = None,
execution_plan: Optional[ExecutionPlan] = None,
native_handle: Optional[_Algorithm] = None,
execution_plan: Optional[CppExecutionPlan] = None,
native_handle: Optional[CppAlgorithm] = None,
tags: Optional[Dict[str, int]] = None,
constraint: Optional[Constraint] = None,
):
if execution_plan is not None:
self._algorithm = _DslAlgorithm(
self._algorithm = CppDslAlgorithm(
id,
execution_plan,
tags=tags if tags is not None else {},
constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(),
constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(),
)
elif native_handle is not None:
self._algorithm = native_handle
@classmethod
def create_from_native_handle(cls, handle: _Algorithm):
def create_from_native_handle(cls, handle: CppAlgorithm):
"""Create an Algorithm instance from a native C++ algorithm handle.
Args:
@@ -97,7 +101,7 @@ class Algorithm:
Returns:
A new Algorithm instance wrapping the algorithm from the capsule.
"""
handle = _Algorithm.from_native_capsule(obj)
handle = CppAlgorithm.from_native_capsule(obj)
return cls(native_handle=handle)
@cached_property
@@ -110,18 +114,31 @@ class Algorithm:
"""The collective operation this algorithm implements (e.g., "allreduce", "allgather")."""
return self._algorithm.collective
@cached_property
@property
def message_size_range(self) -> Tuple[int, int]:
"""The valid message size range (min_size, max_size) in bytes."""
return (self._algorithm.message_range[0], self._algorithm.message_range[1])
def set_message_size_range(self, min_message_size: int, max_message_size: int):
"""Set the valid message size range in bytes.
Args:
min_message_size: Minimum supported message size in bytes.
max_message_size: Maximum supported message size in bytes.
Only supported for native algorithms. Raises TypeError for DSL algorithms.
"""
if self.is_dsl_algorithm():
raise TypeError("set_message_size_range is only supported for native algorithms")
self._algorithm.set_message_size_range(min_message_size, max_message_size)
@cached_property
def tags(self) -> Dict[str, int]:
"""Dictionary of tag names to tag values for algorithm selection hints."""
return self._algorithm.tags
@cached_property
def buffer_mode(self) -> CollectiveBufferMode:
def buffer_mode(self) -> CppCollectiveBufferMode:
"""The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY)."""
return self._algorithm.buffer_mode
@@ -131,7 +148,7 @@ class Algorithm:
Returns:
True if this algorithm is defined using DSL/execution plan, False otherwise.
"""
if self._algorithm.type == _AlgorithmType.DSL:
if self._algorithm.type == CppAlgorithmType.DSL:
return True
return False
@@ -141,24 +158,26 @@ class Algorithm:
Returns:
True if this algorithm is implemented natively, False otherwise.
"""
if self._algorithm.type == _AlgorithmType.NATIVE:
if self._algorithm.type == CppAlgorithmType.NATIVE:
return True
return False
def execute(
self,
comm: Communicator,
comm: CppCommunicator,
input_buffer: int,
output_buffer: int,
input_size: int,
output_size: int,
dtype: DataType,
op: ReduceOp = ReduceOp.NOP,
dtype: CppDataType,
op: CppReduceOp = CppReduceOp.NOP,
stream: int = 0,
executor: Optional[Executor] = None,
executor: Optional[CppExecutor] = None,
nblocks=0,
nthreads_per_block=0,
symmetric_memory: bool = False,
extras: Optional[Dict[str, int]] = None,
accum_dtype: Optional[CppDataType] = None,
) -> int:
"""Execute the collective algorithm.
@@ -174,11 +193,16 @@ class Algorithm:
executor: The executor for DSL algorithms (required for DSL, optional for native).
nblocks: Number of CUDA blocks (0 for auto-selection).
nthreads_per_block: Number of threads per block (0 for auto-selection).
symmetric_memory: Whether to use symmetric memory optimization (default: False).
extras: Additional algorithm-specific parameters.
accum_dtype: Data type for accumulation during reduction. If None, defaults to
the same as dtype. Use DataType.float32 for high-precision FP8 accumulation.
Returns:
The result code (0 for success).
"""
merged_extras = dict(extras) if extras is not None else {}
accum_dtype = accum_dtype if accum_dtype is not None else dtype
return self._algorithm.execute(
comm,
int(input_buffer),
@@ -191,12 +215,18 @@ class Algorithm:
executor,
nblocks,
nthreads_per_block,
extras if extras is not None else {},
symmetric_memory,
merged_extras,
int(accum_dtype),
)
def reset(self):
"""Reset the internal state of the algorithm, if applicable."""
self._algorithm.reset()
class AlgorithmBuilder:
def __init__(self, algorithm_builder: _AlgorithmBuilder):
def __init__(self, algorithm_builder: CppAlgorithmBuilder):
self._algorithm_builder = algorithm_builder
def build(self) -> Algorithm:
@@ -204,7 +234,7 @@ class AlgorithmBuilder:
class AlgorithmCollection:
def __init__(self, native_collection: _AlgorithmCollection):
def __init__(self, native_collection: CppAlgorithmCollection):
self._native_collection = native_collection
self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()]
@@ -228,3 +258,24 @@ class AlgorithmCollection:
"""Register an algorithm for a collective operation."""
self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm)
self._algorithms.append(algorithm)
_flag_buffer_cache = None


def get_flag_buffer() -> cp.ndarray:
    """Return the shared GPU flag buffer used by default algorithm selection.

    The native runtime owns the underlying allocation; this function wraps it
    as a uint32 CuPy view without taking ownership. The wrapper is cached at
    module level so every caller shares the same array object.

    Returns:
        A CuPy uint32 ndarray viewing the native flag buffer.
    """
    global _flag_buffer_cache
    if _flag_buffer_cache is not None:
        return _flag_buffer_cache
    ptr, nbytes, owner = cpp_get_flag_buffer()
    # Wrap the externally-owned allocation; `owner` keeps it alive.
    raw = cp.cuda.UnownedMemory(ptr, nbytes, owner)
    memptr = cp.cuda.MemoryPointer(raw, 0)
    # 4 bytes per uint32 flag word.
    _flag_buffer_cache = cp.ndarray((nbytes // 4,), dtype=cp.uint32, memptr=memptr)
    return _flag_buffer_cache

View File

@@ -6,7 +6,7 @@ from typing import Union, Tuple
import cupy as cp
import numpy as np
from mscclpp._mscclpp import RawGpuBuffer
from mscclpp._mscclpp import CppRawGpuBuffer
__all__ = ["GpuBuffer"]
@@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray):
if any(s <= 0 for s in shape):
raise ValueError("Shape must be positive.")
# Create the buffer
buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)

View File

@@ -6,21 +6,21 @@ from typing import Type
import cupy as cp
from mscclpp._mscclpp import (
Communicator,
Connection,
CppCommunicator,
CppConnection,
connect_nvls_collective,
EndpointConfig,
Semaphore,
ProxyService,
RegisteredMemory,
PortChannel,
MemoryChannel,
TcpBootstrap,
Transport,
TransportFlags,
CppEndpointConfig,
CppSemaphore,
CppProxyService,
CppRegisteredMemory,
CppPortChannel,
CppMemoryChannel,
CppTcpBootstrap,
CppTransport,
CppTransportFlags,
)
import mpi4py
import numpy as np
import pickle
from mscclpp.utils import is_torch_tensor
@@ -29,27 +29,47 @@ __all__ = ["CommGroup"]
class CommGroup:
def __init__(
self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
self,
mpi_comm: "mpi4py.MPI.Comm" = None,
torch_group: "dist.ProcessGroup" = None,
interfaceIpPortTrio: str = "",
rank: int = None,
size: int = None,
):
if interfaceIpPortTrio == "":
self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
uniq_id = None
if mpi_comm.rank == 0:
# similar to NCCL's unique id
rank, size = (
(mpi_comm.Get_rank(), mpi_comm.Get_size())
if mpi_comm is not None
else (torch_group.rank(), torch_group.size())
)
self.bootstrap = CppTcpBootstrap.create(rank, size)
if rank == 0:
uniq_id = self.bootstrap.create_unique_id()
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
if mpi_comm is not None:
import mpi4py
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
else:
import torch
import torch.distributed as dist
if rank == 0:
uniq_id_global = uniq_id
pickled_data = pickle.dumps(uniq_id)
data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone()
else:
data_tensor = torch.zeros(256, dtype=torch.uint8)
dist.broadcast(data_tensor, src=0, group=torch_group)
uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
self.bootstrap.initialize(uniq_id_global)
elif mpi_comm:
# use this instead
self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
self.bootstrap.initialize(interfaceIpPortTrio)
elif not interfaceIpPortTrio == "":
assert rank >= 0 and size >= 1
self.bootstrap = TcpBootstrap.create(rank, size)
self.bootstrap = CppTcpBootstrap.create(rank, size)
self.bootstrap.initialize(interfaceIpPortTrio)
else:
raise RuntimeError("Either the interface or mpi_group need to be specified")
self.communicator = Communicator(self.bootstrap)
self.communicator = CppCommunicator(self.bootstrap)
self.my_rank = self.bootstrap.get_rank()
self.nranks = self.bootstrap.get_n_ranks()
self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
@@ -63,43 +83,43 @@ class CommGroup:
def recv(self, tensor: np.ndarray, peer: int, tag: int):
self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag)
def my_ib_device(self, local_rank: int) -> Transport:
def my_ib_device(self, local_rank: int) -> CppTransport:
if local_rank == 0:
return Transport.IB0
return CppTransport.IB0
if local_rank == 1:
return Transport.IB1
return CppTransport.IB1
if local_rank == 2:
return Transport.IB2
return CppTransport.IB2
if local_rank == 3:
return Transport.IB3
return CppTransport.IB3
if local_rank == 4:
return Transport.IB4
return CppTransport.IB4
if local_rank == 5:
return Transport.IB5
return CppTransport.IB5
if local_rank == 6:
return Transport.IB6
return CppTransport.IB6
if local_rank == 7:
return Transport.IB7
return CppTransport.IB7
else:
assert False # only 8 IBs are supported
def make_connection(
self,
all_ranks: list[int],
endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport],
endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport],
use_switch: bool = False,
) -> dict[int, Connection]:
if type(endpoints) is Transport:
endpoints = EndpointConfig(endpoints)
) -> dict[int, CppConnection]:
if type(endpoints) is CppTransport:
endpoints = CppEndpointConfig(endpoints)
elif type(endpoints) is dict:
endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()}
endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()}
connections = {}
for rank in all_ranks:
if type(endpoints) is dict:
endpoint = endpoints[rank]
else:
endpoint = endpoints
if endpoint.transport == Transport.CudaIpc and use_switch:
if endpoint.transport == CppTransport.CudaIpc and use_switch:
return connect_nvls_collective(self.communicator, all_ranks, 2**30)
else:
connections[rank] = self.communicator.connect(endpoint, rank)
@@ -107,8 +127,8 @@ class CommGroup:
return connections
def register_tensor_with_connections(
self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection]
) -> dict[int, RegisteredMemory]:
self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection]
) -> dict[int, CppRegisteredMemory]:
local_reg_memory = self.register_local_memory(tensor, connections)
all_registered_memories = {}
all_registered_memories[self.my_rank] = local_reg_memory
@@ -121,8 +141,8 @@ class CommGroup:
return all_registered_memories
def _register_memory_with_connections(
self, memory: RegisteredMemory, connections: dict[int, Connection]
) -> dict[int, RegisteredMemory]:
self, memory: CppRegisteredMemory, connections: dict[int, CppConnection]
) -> dict[int, CppRegisteredMemory]:
all_registered_memories = {}
all_registered_memories[self.my_rank] = memory
future_memories = {}
@@ -133,18 +153,20 @@ class CommGroup:
all_registered_memories[rank] = future_memories[rank].get()
return all_registered_memories
def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]:
def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]:
future_semaphores = {}
for rank in connections:
future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank)
return {rank: future.get() for rank, future in future_semaphores.items()}
def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]:
def make_memory_channels(
self, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, CppMemoryChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self.register_tensor_with_connections(tensor, connections)
channels = {}
for rank in connections:
channels[rank] = MemoryChannel(
channels[rank] = CppMemoryChannel(
semaphores[rank], registered_memories[rank], registered_memories[self.my_rank]
)
return channels
@@ -152,9 +174,9 @@ class CommGroup:
def make_memory_channels_with_scratch(
self,
tensor: cp.ndarray,
registeredScratchBuffer: RegisteredMemory,
connections: dict[int, Connection],
) -> dict[int, MemoryChannel]:
registeredScratchBuffer: CppRegisteredMemory,
connections: dict[int, CppConnection],
) -> dict[int, CppMemoryChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections)
channels = {}
@@ -162,17 +184,17 @@ class CommGroup:
tensor_size = (
tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize
)
local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags())
local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags())
scratch_data_ptr = registeredScratchBuffer.data()
for rank in connections:
channels[rank] = MemoryChannel(
channels[rank] = CppMemoryChannel(
semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr
)
return channels
def make_port_channels(
self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection]
) -> dict[int, PortChannel]:
self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, CppPortChannel]:
semaphores = self.make_semaphores(connections)
registered_memories = self.register_tensor_with_connections(tensor, connections)
memory_ids = {}
@@ -188,12 +210,12 @@ class CommGroup:
def make_port_channels_with_scratch(
self,
proxy_service: ProxyService,
proxy_service: CppProxyService,
tensor: cp.ndarray,
registeredScratchBuffer: RegisteredMemory,
connections: dict[int, Connection],
) -> dict[int, PortChannel]:
transport_flags = TransportFlags()
registeredScratchBuffer: CppRegisteredMemory,
connections: dict[int, CppConnection],
) -> dict[int, CppPortChannel]:
transport_flags = CppTransportFlags()
for rank in connections:
transport_flags |= connections[rank].transport()
data_ptr = (
@@ -223,8 +245,8 @@ class CommGroup:
return channels
def register_semaphore_with_proxy(
self, proxy_service: ProxyService, connections: dict[int, Connection]
) -> dict[int, PortChannel]:
self, proxy_service: CppProxyService, connections: dict[int, CppConnection]
) -> dict[int, CppPortChannel]:
semaphores = self.make_semaphores(connections)
semaphore_ids = {}
for rank in semaphores:
@@ -235,7 +257,7 @@ class CommGroup:
return channels
def register_memory_with_proxy(
self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection]
self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection]
) -> dict[int, int]:
registered_memories = self.register_tensor_with_connections(tensor, connections)
memory_ids = {}
@@ -243,8 +265,8 @@ class CommGroup:
memory_ids[rank] = proxy_service.add_memory(registered_memories[rank])
return memory_ids
def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory:
transport_flags = TransportFlags()
def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory:
transport_flags = CppTransportFlags()
for rank in connections:
transport_flags |= connections[rank].transport()
data_ptr = (

View File

@@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram
from mscclpp.language.utils import AlgoSpec
from mscclpp.utils import get_device_arch
from mscclpp._mscclpp import (
ExecutionPlan,
)
from mscclpp._mscclpp import CppExecutionPlan, env
logging.basicConfig(level=logging.INFO)
@@ -51,7 +49,7 @@ class DslCompiler:
into execution plans that can be run on GPUs. The compiled plans are cached
to disk for reuse.
The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR`
The cache location can be configured via the `MSCCLPP_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp`).
Example:
@@ -138,7 +136,7 @@ class DslCompiler:
)
).hexdigest()
plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp")
plan_dir = Path(env().cache_dir)
os.makedirs(plan_dir, exist_ok=True)
filename = f"{plan_id}.json"
plan_path = os.path.join(plan_dir, filename)
@@ -157,7 +155,7 @@ class DslCompiler:
os.remove(tmp_path)
except Exception:
Path(plan_path).unlink(missing_ok=True)
execution_plan = ExecutionPlan(plan_path, rank)
execution_plan = CppExecutionPlan(plan_path, rank)
return Algorithm(
id=plan_id,
execution_plan=execution_plan,
@@ -179,8 +177,8 @@ class NativeCodeCompiler:
based on the runtime environment. Compiled modules are cached to avoid
recompilation.
The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp/native`).
The cache location can be configured via the `MSCCLPP_CACHE_DIR`
environment variable (defaults to `~/.cache/mscclpp`).
Attributes:
_is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA.
@@ -226,8 +224,7 @@ class NativeCodeCompiler:
"-L" + os.path.join(self._lib_home, "lib"),
"-lmscclpp",
]
cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native")
self._cache_dir = Path(cache_root)
self._cache_dir = Path(env().cache_dir) / "native"
self._cache_dir.mkdir(parents=True, exist_ok=True)
def _get_compiler(self) -> str:
@@ -283,7 +280,7 @@ class NativeCodeCompiler:
Note:
- The source file should include pybind11 bindings to expose functions.
- MSCCLPP headers are automatically included in the compilation.
- The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native).
- The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp).
- File locking is used to prevent race conditions during parallel compilation.
Example:

View File

@@ -3,12 +3,10 @@
from __future__ import annotations
from typing import Union
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer
import atexit
from mscclpp._mscclpp import (
AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
)
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
__all__ = ["AlgorithmCollectionBuilder"]
@@ -24,13 +22,14 @@ class AlgorithmCollectionBuilder:
@classmethod
def reset(cls):
if cls._instance is not None:
_AlgorithmCollectionBuilder.reset()
CppAlgorithmCollectionBuilder.reset()
cls._instance = None
def __init__(self):
if not hasattr(self, "_initialized"):
self._builder = _AlgorithmCollectionBuilder.get_instance()
self._builder = CppAlgorithmCollectionBuilder.get_instance()
self._initialized = True
self._flag_buffer = None
def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -52,8 +51,17 @@ class AlgorithmCollectionBuilder:
collection = self._builder.build()
return AlgorithmCollection(collection)
def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
def build_default_algorithms(
self,
scratch_buffer: int,
scratch_buffer_size: int,
rank: int,
) -> AlgorithmCollection:
if self._flag_buffer is None:
self._flag_buffer = get_flag_buffer()
native_collection = self._builder.build_default_algorithms(
int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
)
return AlgorithmCollection(native_collection)

View File

@@ -24,11 +24,11 @@ def _a2av_dbg(msg: str):
if _DEBUG_A2AV:
print(msg, file=sys.stderr, flush=True)
from mscclpp._mscclpp import (
Communicator,
TcpBootstrap,
DataType,
ReduceOp,
CommResult,
CppCommunicator as Communicator,
CppTcpBootstrap as TcpBootstrap,
CppDataType as DataType,
CppReduceOp as ReduceOp,
CppCommResult as CommResult,
)
from mscclpp.ext.algorithm_collection_builder import AlgorithmCollectionBuilder
@@ -375,6 +375,7 @@ class MscclppAlltoAllV:
None, # executor (not needed for native algos)
0, # nblocks (auto)
0, # nthreads_per_block (auto)
False, # symmetric_memory
self._extras,
)

View File

@@ -140,7 +140,7 @@ class MemoryChannel:
for tb_id in tb_list:
tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type)
tb_channel_ids = get_program().setup_channel(tb, self)
tb_channel_ids = get_program().setup_channel(tb_id, self)
op = GetOperation(
src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)],
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)],

View File

@@ -534,6 +534,7 @@ class PutOperation(BaseOperation):
self.dst_buff = dst_buff
self.channel_ids = channel_ids
self.channel_type = channel_type
self.from_packet = from_packet
self.to_packet = to_packet
self.with_signal = with_signal
self.with_signal_and_flush = with_signal_and_flush
@@ -579,6 +580,25 @@ class PutOperation(BaseOperation):
with_signal=self.with_signal,
with_signal_and_flush=self.with_signal_and_flush,
)
elif (
isinstance(other, PutOperation)
and self.name == Instruction.read_put_packet
and self.name == other.name
and self.src_buff == other.src_buff
and self.channel_type == other.channel_type
and self.tbg_info == other.tbg_info
):
fused_operation = PutOperation(
src_buff=self.src_buff,
dst_buff=self.dst_buff + other.dst_buff,
channel_ids=self.channel_ids + other.channel_ids,
channel_type=self.channel_type,
tbg_info=self.tbg_info,
from_packet=self.from_packet,
to_packet=self.to_packet,
with_signal=self.with_signal,
with_signal_and_flush=self.with_signal_and_flush,
)
return fused_operation
@@ -725,7 +745,7 @@ class ReduceOperation(BaseOperation):
remote_dst_buff=self.remote_dst_buff + other.dst_buff,
channel_ids=self.channel_ids,
put_channel_ids=self.put_channel_ids + other.channel_ids,
channel_type=self.channel_type,
channel_type=other.channel_type,
reduce_operation=self.reduce_operation,
tbg_info=self.tbg_info,
packet=self.packet,

View File

@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *
def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Emit an LL-protocol all-gather execution plan as JSON on stdout.

    Each rank packs its input chunk into the upper half of its scratch buffer,
    pushes that packet into every peer's scratch buffer over memory channels,
    and finally unpacks all received packets into its output buffer.

    Args:
        name: Name embedded in the generated program.
        gpu_size: Number of participating GPUs (ranks).
        num_threads_per_block: Threads per block used by the plan.
        min_message_size: Smallest message size (bytes) the plan covers.
        max_message_size: Largest message size (bytes) the plan covers.
    """
    chunks_per_loop = 1
    collective = AllGather(gpu_size, chunks_per_loop, True)
    with CollectiveProgram(
        name,
        collective,
        gpu_size,
        protocol="LL",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=True,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # The second half of each scratch buffer stages outgoing packets.
        scratch_offset = gpu_size
        # One scratch buffer per rank, sized for send + receive halves.
        scratch = [Buffer(g, 2 * gpu_size) for g in range(gpu_size)]

        # Stage 1: pack the local input chunk into this rank's scratch slot.
        for g in range(gpu_size):
            sender = Rank(g)
            src = sender.get_input_buffer()
            sender.copy_packets(
                scratch[g][scratch_offset + g : scratch_offset + g + 1], src[0:1], tb=0
            )

        # Stage 2: push the staged packet into every peer's scratch buffer.
        for g in range(gpu_size):
            sender = Rank(g)
            # NOTE(review): result unused here; call kept in case the DSL
            # records buffer usage as a side effect — TODO confirm.
            sender.get_output_buffer()
            for step in range(1, gpu_size):
                dst = (g + step) % gpu_size
                channel = MemoryChannel(dst, g)
                channel.read_put_packets(
                    scratch[dst][g : g + 1],
                    scratch[g][scratch_offset + g : scratch_offset + g + 1],
                    0,
                )

        # Stage 3: unpack every received packet into the output buffer.
        for g in range(gpu_size):
            receiver = Rank(g)
            out = receiver.get_output_buffer()
            for step in range(1, gpu_size):
                src_rank = (g + step) % gpu_size
                receiver.unpack_packets(
                    out[src_rank : src_rank + 1],
                    scratch[g][src_rank : src_rank + 1],
                    tb=0,
                )
        print(JSON())
# CLI entry point: parse plan parameters and emit the all-gather program.
cli = argparse.ArgumentParser()
cli.add_argument("--name", type=str, help="name of the program")
cli.add_argument("--num_gpus", type=int, help="number of gpus")
cli.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
cli.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
cli.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
opts = cli.parse_args()
allgather_example(
    opts.name,
    opts.num_gpus,
    opts.num_threads_per_block,
    opts.min_message_size,
    opts.max_message_size,
)

Some files were not shown because too many files have changed in this diff Show More