Merge branch 'main' into binyli/unique-qp-and-gid-index

Resolve conflicts in env.hpp, env_py.cpp, and env.cpp by combining both
branches' additions: keep main's MSCCLPP_FORCE_DISABLE_GDR field and
this branch's -1 sentinel default for MSCCLPP_IB_GID_INDEX.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Binyang Li
2026-04-17 18:23:23 +00:00
183 changed files with 10229 additions and 3229 deletions

View File

@@ -0,0 +1,93 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: CodeCoverageA100
timeoutInMinutes: 40
pool:
name: msccl-ci
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
gpuArch: '80'
- job: CodeCoverageH100
timeoutInMinutes: 40
pool:
name: msccl-ci-h100
variables:
- group: mscclpp
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'
- job: CodeCoverageMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
variables:
- group: mscclpp
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/codecov.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
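
The job/matrix/container pattern above is the extension point for new toolchains; a minimal sketch of one more matrix entry, assuming a corresponding image tag were published (the cuda13 tag is hypothetical):

cuda13:
  containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda13.0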

View File

@@ -41,11 +41,10 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/integration-test.yaml
- template: templates/integration-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'

View File

@@ -16,168 +16,109 @@ pr: none
parameters:
- name: vmssName
type: string
default: mscclpp-h100-multinode-ci
- name: hostEntries
type: string
default: |
10.0.0.10 mscclit-000000
10.0.0.11 mscclit-000001
10.0.0.5 mscclpp-h100-multinode-ci000000
10.0.0.4 mscclpp-h100-multinode-ci000001
jobs:
- job: MultiNodesTest
displayName: Multi nodes test
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: mscclpp-it
name: mscclpp-multi-node
container:
image: $[ variables['containerImage'] ]
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp-ssh.key
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: Bash@3
displayName: Add HostEntry
inputs:
targetType: 'inline'
script: |
ENTRY="${{ parameters.hostEntries }}"
if ! grep -qxF "$ENTRY" /etc/hosts; then
echo "Adding to /etc/hosts"
echo "$ENTRY" | sudo tee -a /etc/hosts
else
echo "Entry already exists, nothing to do."
fi
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name mscclit-vmss --resource-group msccl-IT
while IFS= read -r line; do
[ -z "$line" ] && continue
if ! grep -qxF "$line" /etc/hosts; then
echo "Adding to /etc/hosts: $line"
echo "$line" | sudo tee -a /etc/hosts
else
echo "Entry already exists: $line"
fi
done <<< "${{ parameters.hostEntries }}"
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
displayName: Generate deploy files
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
VMSS="${{ parameters.vmssName }}"
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
NODE0="${VMSS}000000"
NODE1="${VMSS}000001"
- task: Bash@3
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
echo "Host ${NODE0}
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
Host ${NODE1}
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
- task: Bash@3
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
- template: templates/deploy.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
gpuArch: '90'
- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
continueOnError: true
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests
- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
- template: templates/stop.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
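
To make the generated files concrete: with the default vmssName above, the Generate deploy files steps would write roughly the following three files (reconstructed from the echo/printf commands in the steps):

# $(System.DefaultWorkingDirectory)/test/deploy/config
Host mscclpp-h100-multinode-ci000000
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
Host mscclpp-h100-multinode-ci000001
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no

# $(System.DefaultWorkingDirectory)/test/deploy/hostfile
azureuser@mscclpp-h100-multinode-ci000000
azureuser@mscclpp-h100-multinode-ci000001

# $(System.DefaultWorkingDirectory)/test/deploy/hostfile_mpi
mscclpp-h100-multinode-ci000000
mscclpp-h100-multinode-ci000001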

View File

@@ -40,11 +40,11 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
- job: NcclTestH100
@@ -61,9 +61,9 @@ jobs:
image: $(containerImage)
steps:
- template: templates/nccl-test.yaml
- template: templates/nccl-test.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
nvccGencode: "-gencode=arch=compute_90,code=sm_90"

View File

@@ -40,9 +40,8 @@ jobs:
image: $(containerImage)
steps:
- template: templates/rccl-test.yaml
- template: templates/rccl-test.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
gpuArch: gfx942

View File

@@ -0,0 +1,110 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
buildType: Debug
cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
buildDisplayName: 'Build with coverage'
buildName: BuildCoverage
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: TestsCoverageNonPerf
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp
export GCOV_PREFIX_STRIP=$STRIP_COUNT
echo "Running unit_tests..."
./build/bin/unit_tests
echo "unit_tests: PASSED"
echo "Running mp_unit_tests -np 2..."
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 2: PASSED"
echo "Running mp_unit_tests -np 4..."
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
echo "mp_unit_tests -np 4: PASSED"
- template: run-remote-task.yml
parameters:
name: CaptureCoverage
displayName: Capture coverage data with lcov
remoteScript: |
BUILD_PREFIX=$(cat build/BUILD_PREFIX)
GCOV_TOOL_ARG=""
if [ "${{ parameters.platform }}" = "rocm" ]; then
apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
GCOV_WRAPPER=$(mktemp)
printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
chmod +x "$GCOV_WRAPPER"
GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
fi
lcov --version
LCOV_CAPTURE_ARGS=""
if lcov --help 2>&1 | grep -q "inconsistent"; then
LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
fi
lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
if [ ! -s coverage.info ]; then
echo "ERROR: coverage.info was not generated."
exit 1
fi
lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
lcov --list coverage.info
ls -la coverage.info
- task: Bash@3
name: FetchCoverage
displayName: Fetch coverage data from remote VM
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
HOST=$(head -1 ${HOSTFILE})
ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UploadCodecov
displayName: Upload coverage to Codecov
inputs:
targetType: 'inline'
script: |
set -e
curl -Os https://cli.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
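
The GCOV_PREFIX/GCOV_PREFIX_STRIP pair in TestsCoverageNonPerf deserves a worked example: gcov strips GCOV_PREFIX_STRIP leading directory components from each counter file's compile-time path and prepends GCOV_PREFIX. Assuming a hypothetical agent checkout path:

# BUILD_PREFIX=/home/azureuser/_work/1/s   -> 'tr -cd / | wc -c' counts 5 slashes, so GCOV_PREFIX_STRIP=5
# compile-time dump path:        /home/azureuser/_work/1/s/build/src/fifo.cc.gcda
# after stripping 5 components:  build/src/fifo.cc.gcda
# after prepending GCOV_PREFIX:  /root/mscclpp/build/src/fifo.cc.gcda
# i.e. coverage counters land inside the container's deployed tree, where 'lcov --directory . --capture' finds them.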

View File

@@ -0,0 +1,151 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
# Build parameters
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
default: ''
- name: buildType
type: string
default: 'Release'
- name: buildTests
type: string
default: 'true'
- name: cmakeArgs
type: string
default: ''
- name: buildName
type: string
default: 'Build'
- name: buildDisplayName
type: string
default: 'Build'
# Deploy parameters
- name: deployArgs
type: string
default: ''
steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
name: EnsureAzureCLI
displayName: Ensure Azure CLI Installed
inputs:
targetType: inline
script: |
set -e
if command -v az >/dev/null 2>&1; then
az version >/dev/null
exit 0
fi
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
# 1. Build
- task: Bash@3
name: ${{ parameters.buildName }}
displayName: ${{ parameters.buildDisplayName }}
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
BUILD_TESTS_ARG=""
if [ "${{ parameters.buildTests }}" = "true" ]; then
BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
fi
GPU_ARCH_ARG=""
if [ -n "${{ parameters.gpuArch }}" ]; then
GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ "${{ parameters.platform }}" = "rocm" ]; then
eval CXX=/opt/rocm/bin/hipcc cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_ROCM=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
else
eval cmake \
-DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
${BUILD_TESTS_ARG} \
${GPU_ARCH_ARG} \
${CMAKE_EXTRA_ARGS} ..
fi
make -j
cd ..
pwd > build/BUILD_PREFIX
echo "=== Build artifacts ==="
ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
du -sh build/bin/* 2>/dev/null || true
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 2. Write CMake args for pip install on remote VMs
- task: Bash@3
name: WritePipCmakeArgs
displayName: Write pip CMake args
inputs:
targetType: 'inline'
script: |
set -e
PIP_CMAKE_ARGS=""
if [ -n "${{ parameters.gpuArch }}" ]; then
PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
fi
CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
if [ -n "${CMAKE_EXTRA_ARGS}" ]; then
PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}"
fi
echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt
echo "pip CMake args: $(cat pip_cmake_args.txt)"
workingDirectory: '$(System.DefaultWorkingDirectory)'
# 3. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: mscclpp.pem
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
# 4. Deploy test environment
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: ${{ parameters.deployArgs }}
workingDirectory: '$(System.DefaultWorkingDirectory)'
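
For reference, a caller-side sketch of this template, mirroring how codecov.yml above invokes it; note that cmakeArgs is single-quoted into CMAKE_EXTRA_ARGS and expanded through eval, so a value carrying several -D flags splits into separate arguments (the second flag here is hypothetical, added only to illustrate that):

- template: deploy.yml
  parameters:
    subscription: mscclpp-ci
    vmssName: mscclpp-ci
    gpuArch: '80'
    buildType: Debug
    cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON -DCMAKE_VERBOSE_MAKEFILE=ON'
    buildDisplayName: 'Build with coverage'
    deployArgs: 'single-node-test true cuda'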

View File

@@ -1,242 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: inline
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: SendRecvTest
displayName: Run mscclpp SendRecv test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllReduceTest
displayName: Run mscclpp AllReduce test
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllToAll
displayName: Run mscclpp AllToAll test
inputs:
targetType: 'inline'
script: |
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: CheckPerfNumber
displayName: Check collective primitives performance
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
python3 -m pip install .; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: FifoPerfBenchmark
displayName: FIFO Performance Benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
./build/bin/perf/fifo_test"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,76 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
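
This template is what the integration-test pipeline hunk earlier in this diff calls into; the H100 job overrides perfBaselineFile while the A100 job keeps the perf_ndmv4.jsonl default:

- template: templates/integration-test.yml
  parameters:
    subscription: mscclpp-ci-h100
    vmssName: mscclpp-h100-ci
    perfBaselineFile: test/deploy/perf_ndmv5.jsonl
    gpuArch: '90'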

View File

@@ -1,282 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step-template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription       Azure subscription to use for VMSS start/stop
#   vmssName           VMSS name to start/stop
#   sshKeySecureFile   the secureFile name for your SSH key
#   nvccGencode        NVCC -gencode flags used when building NCCL
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: mscclpp/test/deploy/deploy.sh
arguments: nccltest-single-node
workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp
- task: Bash@3
name: CopyMscclUsers
displayName: Copy msccl-users
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
DST_DIR="/tmp/mscclpp/msccl-users"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: GenerateExecutionFile
# displayName: Generate execution file
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp/msccl-users; \
# mkdir -p execution-files; \
# cd /root/mscclpp/msccl-users; \
# bash algos/mscclpp_a100/generate_execution_plan.sh"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNcclTests
displayName: Install NCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
cd nccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllReduceTest
# displayName: Run NCCL AllReduce Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclAllGatherTest
# displayName: Run NCCL AllGather Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterTest
# displayName: Run NCCL Reduce Scatter Test
# inputs:
# targetType: inline
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: InstallNccl
displayName: Install NCCL
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\\\" -f4); \
if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \
cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \
cd nccl; \
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllGatherFallbackToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclAllReduceFallbackToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunNcclBroadcastFallbackToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
# - task: Bash@3
# name: RunNcclReduceScatterFallbackToNcclTest
# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
# inputs:
# targetType: 'inline'
# script: |
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
# cd /root/mscclpp; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,80 @@
# .azure-pipelines/templates/nccl-test.yml
# ----------------------------------------
# A step-template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription   Azure subscription to use for VMSS start/stop
#   vmssName       VMSS name to start/stop
#   gpuArch        GPU architecture for the build (default '80')
#   nvccGencode    NVCC -gencode flags used when building NCCL
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
default: '80'
- name: nvccGencode
type: string
default: "-gencode=arch=compute_80,code=sm_80"
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'nccltest-single-node'
- template: run-remote-task.yml
parameters:
name: InstallNcclTests
displayName: Install NCCL Tests
remoteScript: |
cd
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
MPI=1 MPI_HOME=/usr/local/mpi make -j
- template: run-remote-task.yml
parameters:
name: InstallNccl
displayName: Install NCCL
remoteScript: |
LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
if [ -z "$LATEST_TAG" ]; then
echo "Failed to fetch latest NCCL tag"
exit 1
fi
cd
git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
- template: run-remote-task.yml
parameters:
name: RunNcclAllGatherFallbackToNcclTest
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclAllReduceFallbackToNcclTest
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunNcclBroadcastFallbackToNcclTest
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
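
Each fallback task above runs its perf binary twice: once forcing fallback for the matching collective and once for a different one. Extending the suite follows the same shape; a sketch for reduce-scatter, which the deleted .yaml carried only in commented-out form, so treat this as a hypothetical addition:

- template: run-remote-task.yml
  parameters:
    name: RunNcclReduceScatterFallbackToNcclTest
    displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
    remoteScript: |
      mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20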

View File

@@ -1,142 +0,0 @@
# .azure-pipelines/templates/rccl-test.yaml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# sshKeySecureFile the secureFile name for your SSH key
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
default: "gfx942"
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true rocm"
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: InstallRcclTests
displayName: Install RCCL Tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd; \
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
cd rocm-systems; \
git sparse-checkout init --cone; \
git sparse-checkout set projects/rccl-tests; \
git checkout; \
cd projects/rccl-tests; \
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllGatherTest
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: RunRcclAllReduceTest
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
ROOT_DIR=$(System.DefaultWorkingDirectory)
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,63 @@
# .azure-pipelines/templates/rccl-test.yml
# ------------------------------------------------
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
#
# Parameters:
# subscription Azure subscription to use for VMSS start/stop
# vmssName VMSS name to start/stop
# gpuArch GPU architecture (e.g. gfx942)
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
default: "gfx942"
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: rocm
gpuArch: ${{ parameters.gpuArch }}
buildTests: false
deployArgs: 'single-node-test true rocm'
- template: run-remote-task.yml
parameters:
name: InstallRcclTests
displayName: Install RCCL Tests
remoteScript: |
cd
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
cd rocm-systems
git sparse-checkout init --cone
git sparse-checkout set projects/rccl-tests
git checkout
cd projects/rccl-tests
MPI=1 MPI_HOME=/usr/local/mpi make -j
- template: run-remote-task.yml
parameters:
name: RunRcclAllGatherTest
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: run-remote-task.yml
parameters:
name: RunRcclAllReduceTest
displayName: Run RCCL AllReduce Test with and without MSCCLPP Lib
remoteScript: |
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -0,0 +1,31 @@
parameters:
- name: name
type: string
default: ''
- name: displayName
type: string
- name: runRemoteArgs
type: string
default: ''
- name: remoteScript
type: string
- name: workingDirectory
type: string
default: '$(System.DefaultWorkingDirectory)'
- name: continueOnError
type: boolean
default: false
steps:
- task: Bash@3
${{ if ne(parameters.name, '') }}:
name: ${{ parameters.name }}
displayName: ${{ parameters.displayName }}
continueOnError: ${{ parameters.continueOnError }}
inputs:
targetType: 'inline'
script: |
test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
${{ parameters.remoteScript }}
REMOTE_CMD
workingDirectory: ${{ parameters.workingDirectory }}
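For reference, a minimal local sketch of what this template drives, assuming `test/deploy/run-remote.sh` reads the remote commands from stdin exactly as the heredoc above does (run from the repository root; flags other than the `--no-docker`/`--no-log` used elsewhere in this change are not assumed):
```bash
# Minimal sketch: feed a remote script to the same helper the template wraps.
# Assumes run-remote.sh consumes the script on stdin, as in the template above.
test/deploy/run-remote.sh <<'REMOTE_CMD'
cd /root/mscclpp
./build/bin/unit_tests
REMOTE_CMD
```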

View File

@@ -0,0 +1,20 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp
steps:
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

View File

@@ -0,0 +1,42 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: ExecutorTest
displayName: Run executor tests
remoteScript: |
python3 -m pip install .
PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,191 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: StopContainer
displayName: Stop existing container
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: BuildWithIb
displayName: Rebuild with IB
inputs:
targetType: 'inline'
script: |
rm -rf build && mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: DeployTestEnvWithIb
displayName: Deploy Test Env (with IB build)
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- task: Bash@3
name: PyTestsWithIbBuildDisableIb
displayName: Run pytests (IB build, IB tests disabled)
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,95 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
deployArgs: 'single-node-test false'
- template: run-remote-task.yml
parameters:
name: UnitTests
displayName: Run mscclpp unit tests
remoteScript: |
./build/bin/unit_tests
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
- template: run-remote-task.yml
parameters:
name: StopContainer
displayName: Stop existing container
runRemoteArgs: '--no-docker --no-log'
remoteScript: |
sudo docker stop mscclpp-test || true
sudo docker rm mscclpp-test || true
- task: Bash@3
displayName: Remove generated SSH key files
inputs:
targetType: 'inline'
script: |
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: BuildWithIb
displayName: Rebuild with IB
inputs:
targetType: 'inline'
script: |
set -e
rm -rf build
mkdir -p build && cd build
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
-DMSCCLPP_USE_CUDA=ON \
-DMSCCLPP_BUILD_TESTS=ON \
-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: DeployTestEnvWithIb
displayName: Deploy Test Env (with IB build)
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: single-node-test false
workingDirectory: $(System.DefaultWorkingDirectory)
- template: run-remote-task.yml
parameters:
name: PyTestsWithIbBuildDisableIb
displayName: Run pytests (IB build, IB tests disabled)
remoteScript: |
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,145 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: gpuArch
type: string
steps:
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
set -e; \
cd /root/mscclpp; \
mkdir -p build && cd build; \
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
make -j"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: 'inline'
script: |
# set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
export PATH=/usr/local/mpi/bin:\$PATH; \
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,57 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
deployArgs: 'single-node-test'
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,142 +0,0 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- task: Bash@3
name: Build
displayName: Build
inputs:
targetType: 'inline'
script: |
mkdir build && cd build
if [ "${{ parameters.platform }}" == "rocm" ]; then
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
else
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
fi
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: 'inline'
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test true ${{ parameters.platform }}"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: UnitTests
displayName: Run mscclpp unit tests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
./build/bin/unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: PyTests
displayName: Run pytests
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -0,0 +1,49 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: platform
type: string
default: 'cuda'
- name: gpuArch
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
platform: ${{ parameters.platform }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test true ${{ parameters.platform }}'
- template: run-remote-task.yml
parameters:
name: UnitTests
displayName: Run mscclpp unit tests
remoteScript: |
./build/bin/unit_tests
- template: run-remote-task.yml
parameters:
name: MpUnitTests
displayName: Run mscclpp multi-process unit tests
remoteScript: |
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- template: run-remote-task.yml
parameters:
name: PyTests
displayName: Run pytests
remoteScript: |
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}

View File

@@ -1,50 +0,0 @@
trigger:
branches:
include:
- main
- release/*
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
pr:
branches:
include:
- main
- release/*
drafts: false
paths:
exclude:
- .devcontainer/**
- .github/**
- apps/**
- docker/**
- docs/**
- '**/*.md'
jobs:
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
sshKeySecureFile: mscclpp.pem
platform: rocm
gpuArch: gfx942

View File

@@ -37,17 +37,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestWithNpKitA100
@@ -59,17 +58,16 @@ jobs:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'
- job: UnitTestH100
@@ -79,17 +77,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut.yaml
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestWithNpKitH100
@@ -99,17 +96,16 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-npkit.yaml
- template: templates/ut-npkit.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestNoIBEnv
@@ -121,15 +117,55 @@ jobs:
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-no-ib-env.yaml
- template: templates/ut-no-ib-env.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '90'
- job: UnitTestMI300X
timeoutInMinutes: 40
pool:
name: msccl-ci-mi300x
strategy:
matrix:
rocm6_2:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
container:
image: $(containerImage)
steps:
- template: templates/ut.yml
parameters:
subscription: mscclpp-ci-mi300x
vmssName: mscclpp-mi300x-ci
platform: rocm
gpuArch: gfx942
- job: UnitTestExecutor
timeoutInMinutes: 60
displayName: Test DSL Executor
pool:
name: msccl-ci-h100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
container:
image: $(containerImage)
steps:
- template: templates/ut-executor.yml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
gpuArch: '90'

.codecov.yml Normal file
View File

@@ -0,0 +1,24 @@
codecov:
require_ci_to_pass: yes
coverage:
status:
project:
default:
target: 68%
threshold: 1%
patch:
default:
target: 80%
flag_management:
default_rules:
carryforward: true
ignore:
- "test/"
- "examples/"
- "python/"
- "tools/"
- "docs/"
- "docker/"

View File

@@ -43,7 +43,7 @@ For testing after successful build:
# To run tests with two GPUs - two is enough for most tests
mpirun -np 2 ./build/bin/mp_unit_tests
# To run tests excluding IB-related ones (when IB is not available)
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
```
For building a Python package:

View File

@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- name: Checkout repository
@@ -62,7 +62,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis
@@ -107,7 +107,7 @@ jobs:
- name: Build
run: |
rm -rf build && mkdir build && cd build
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
make -j4
- name: Perform CodeQL Analysis

View File

@@ -1,69 +0,0 @@
name: IntegrationTest
on: workflow_dispatch
jobs:
IntegrationTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: Run mscclpp AllGather test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.8' ]
version: [ 'cuda11.8', 'cuda12.9' ]
steps:
- uses: actions/checkout@v4

View File

@@ -1,52 +0,0 @@
name: UnitTest
on: workflow_dispatch
jobs:
UnitTest:
runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.2 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
- name: LockGPUClock
run: |
sudo nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UnitTests
run: |
./build/bin/unit_tests
- name: MpUnitTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
- name: PyTests
run: |
set -e
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x

.gitignore vendored
View File

@@ -1,5 +1,6 @@
.vscode/
build/
build_coverage/
__pycache__
.*.swp
*.so

View File

@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
@@ -99,6 +100,62 @@ else()
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
endif()
endif()
# Code coverage setup
if(MSCCLPP_ENABLE_COVERAGE)
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
message(STATUS "Code coverage enabled")
# Add coverage flags to C++ targets only (not CUDA)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
# Find lcov
find_program(LCOV_PATH lcov)
if(NOT LCOV_PATH)
message(WARNING "lcov not found. Install lcov to generate coverage reports.")
endif()
if(LCOV_PATH)
# Add coverage target
add_custom_target(coverage
COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
COMMAND ${LCOV_PATH} --directory . --zerocounters
COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Generating code coverage report"
)
# Add coverage clean target
add_custom_target(coverage-clean
COMMAND ${CMAKE_COMMAND} -E remove coverage.info
COMMAND ${LCOV_PATH} --directory . --zerocounters
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Cleaning coverage data"
)
endif()
else()
message(WARNING "Code coverage is only supported with GCC or Clang compilers")
endif()
endif()
if(MSCCLPP_GPU_ARCHS)
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
@@ -167,9 +224,30 @@ if(MSCCLPP_USE_IB)
if(NOT IBVERBS_FOUND)
message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
endif()
find_package(MLX5)
if(MLX5_FOUND)
message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
else()
message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
endif()
endif()
find_package(NUMA REQUIRED)
find_package(Threads REQUIRED)
option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
if(MSCCLPP_USE_ROCM)
set(MSCCLPP_USE_GDRCOPY OFF)
endif()
if(MSCCLPP_USE_GDRCOPY)
find_package(GDRCopy)
if(NOT GDRCOPY_FOUND)
message(STATUS "GDRCopy not found, disabling GDRCopy support")
set(MSCCLPP_USE_GDRCOPY OFF)
else()
message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
endif()
endif()
include(FetchContent)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
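A sketch of a local coverage run using the options and targets added above, assuming a GCC/Clang toolchain, lcov/genhtml installed, and tests registered with CTest (the `build_coverage` directory matches the new `.gitignore` entry):
```bash
# Configure with coverage instrumentation; Debug avoids the optimization warning above
mkdir -p build_coverage && cd build_coverage
cmake -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_ENABLE_COVERAGE=ON -DMSCCLPP_BUILD_TESTS=ON ..
make -j
# The custom target zeroes counters, runs ctest, then captures and filters coverage.info
make coverage
# Optional: render an HTML report from the filtered data
genhtml coverage.info --output-directory coverage_html
```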

View File

@@ -3,15 +3,16 @@
[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/)
[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp)
| Testing Pipelines | Build Status |
|--------------------------|-------------------|
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
A GPU-driven communication stack for scalable AI applications.

View File

@@ -1 +1 @@
0.8.0
0.9.0

cmake/FindGDRCopy.cmake Normal file
View File

@@ -0,0 +1,50 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
#
# The following variables are optionally searched for defaults
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
# The following are set after configuration is done:
# GDRCOPY_FOUND
# GDRCOPY_INCLUDE_DIRS
# GDRCOPY_LIBRARIES
find_path(GDRCOPY_INCLUDE_DIRS
NAMES gdrapi.h
HINTS
${GDRCOPY_INCLUDE_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(GDRCOPY_LIBRARIES
NAMES gdrapi
HINTS
${GDRCOPY_LIB_DIR}
${GDRCOPY_ROOT_DIR}
${GDRCOPY_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
if(GDRCOPY_INCLUDE_DIRS)
include(CheckSymbolExists)
set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
unset(CMAKE_REQUIRED_LIBRARIES)
unset(CMAKE_REQUIRED_INCLUDES)
if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
endif()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)

cmake/FindMLX5.cmake Normal file
View File

@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Find the MLX5 Direct Verbs (mlx5dv) library
#
# The following variables are optionally searched for defaults
# MLX5_ROOT_DIR: Base directory where all MLX5 components are found
# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
# MLX5_LIB_DIR: Directory where MLX5 libraries are found
# The following are set after configuration is done:
# MLX5_FOUND
# MLX5_INCLUDE_DIRS
# MLX5_LIBRARIES
find_path(MLX5_INCLUDE_DIRS
NAMES infiniband/mlx5dv.h
HINTS
${MLX5_INCLUDE_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/include
/usr/local/include
/usr/include)
find_library(MLX5_LIBRARIES
NAMES mlx5
HINTS
${MLX5_LIB_DIR}
${MLX5_ROOT_DIR}
${MLX5_ROOT_DIR}/lib
/usr/local/lib
/usr/lib
/usr/lib/x86_64-linux-gnu)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
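Both find-modules follow the same hint-variable convention, so non-default installs can be pointed at explicitly. A sketch (the paths are illustrative):
```bash
# Hint variables consumed by FindGDRCopy.cmake and FindMLX5.cmake above
cmake -DGDRCOPY_ROOT_DIR=/opt/gdrcopy -DMLX5_ROOT_DIR=/usr ..
# Or split the include and library locations
cmake -DGDRCOPY_INCLUDE_DIR=/opt/gdrcopy/include -DGDRCOPY_LIB_DIR=/opt/gdrcopy/lib64 ..
```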

View File

@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
RUN apt-get update && \
apt-get install -y --no-install-recommends \
htop \
lcov \
vim \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install lcov 2.2
RUN LCOV_VERSION="2.2" && \
apt-get update && \
apt-get install -y --no-install-recommends \
cpanminus \
gcc \
make \
perl \
&& \
cpanm --notest \
Capture::Tiny \
DateTime \
JSON::XS \
Memory::Process \
TimeDate \
&& \
cd /tmp && \
curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
tar xzf lcov.tar.gz && \
cd lcov-${LCOV_VERSION} && \
make install && \
cd / && rm -rf /tmp/lcov* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
@@ -24,8 +49,25 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Install ROCm-specific packages if building for ROCm
# Install GDRCopy userspace library for CUDA targets
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^cuda"; then \
GDRCOPY_VERSION="2.5.2" && \
apt-get update -y && \
apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
cd /tmp && \
curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
tar xzf gdrcopy.tar.gz && \
cd gdrcopy-${GDRCOPY_VERSION}/packages && \
./build-deb-packages.sh -k -t && \
dpkg -i libgdrapi_*.deb && \
cd / && rm -rf /tmp/gdrcopy* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi
# Install ROCm-specific packages if building for ROCm
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
@@ -47,7 +89,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
fi && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
pip install --no-cache-dir coverage xlsxwriter
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -4,27 +4,22 @@ set -e
declare -A baseImageTable
baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
)
declare -A extraLdPathTable
extraLdPathTable=(
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
["rocm6.2"]="/opt/rocm/lib"
)
declare -A ofedVersionTable
ofedVersionTable=(
["cuda11.8"]="23.07-0.5.1.2"
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
@@ -36,7 +31,7 @@ TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then

View File

@@ -5,7 +5,7 @@
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SPHINXMULTIVERSION ?= sphinx-multiversion
SPHINXMULTIVERSION ?= python3 build_multiversion.py
SOURCEDIR = .
BUILDDIR = _build

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Wrapper around sphinx-multiversion that patches copy_tree to generate
_version.py in each tag checkout. This is needed because setuptools_scm
generates _version.py at build time, but sphinx-multiversion uses
`git archive` which only contains committed files.
Usage (called by Makefile):
python3 build_multiversion.py <sourcedir> <outputdir> [sphinx-opts...]
"""
import os
import re
import subprocess
import sys
import sphinx_multiversion.git as smv_git
from sphinx_multiversion import main as smv_main
# Save the original copy_tree
_original_copy_tree = smv_git.copy_tree
def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."):
"""Call original copy_tree, then generate _version.py from the VERSION file."""
_original_copy_tree(gitroot, src, dst, reference, sourcepath)
# Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0")
refname = getattr(reference, "refname", "") or ""
match = re.search(r"v(\d+\.\d+\.\d+)", refname)
if not match:
return
version = match.group(1)
version_py_dir = os.path.join(dst, "python", "mscclpp")
if os.path.isdir(version_py_dir):
version_py = os.path.join(version_py_dir, "_version.py")
if not os.path.exists(version_py):
with open(version_py, "w") as f:
f.write(f'__version__ = "{version}"\n')
# Monkey-patch
smv_git.copy_tree = _patched_copy_tree
if __name__ == "__main__":
sys.exit(smv_main(sys.argv[1:]))
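Run standalone, the wrapper takes the same positional arguments as sphinx-multiversion; a sketch matching the docstring above:
```bash
# Build versioned docs into _build/html (same entry point the docs Makefile now uses)
python3 build_multiversion.py . _build/html
```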

View File

@@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol
python3 -m mscclpp --install
```
This installs bundled default execution plans into `~/.cache/mscclpp/default` by default.
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory itself, so set it without a trailing `default` component.
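For example (the path below is illustrative):
```bash
# Install bundled plans into a custom cache root; note there is no trailing `default`
export MSCCLPP_CACHE_DIR=/data/mscclpp-cache
python3 -m mscclpp --install
ls "$MSCCLPP_CACHE_DIR/default"   # bundled plans are installed here
```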
## Your First Algorithm: AllGather
Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features.

View File

@@ -59,6 +59,9 @@ After installation, the generated JSON execution plan can be found at:
~/.cache/mscclpp/default/
```
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory itself, so set it without a trailing `default` component.
**Performance Results:**
The figure below shows the performance characteristics for small message sizes in a two-node configuration:

View File

@@ -332,7 +332,8 @@ public:
size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, mscclpp::ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
},
// Context initialization function

View File

@@ -25,12 +25,15 @@
```bash
sudo apt-get install libnuma-dev
```
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package
```bash
sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)"
sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)"
```
If you don't want to build the Python module, set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
* (Optional, for benchmarks) MPI
* (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1
* GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
* Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation); a quick presence check is sketched after this list.
* Others
* For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
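A quick presence check for the GDRCopy prerequisite above, assuming the stock kernel module and library names (`gdrdrv`, `libgdrapi`):
```bash
# Is the GDRCopy kernel driver loaded?
lsmod | grep gdrdrv
# Is the userspace library visible to the linker?
ldconfig -p | grep gdrapi
```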
@@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
```bash
# For NVIDIA platforms
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
# For AMD platforms
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
```
@@ -97,13 +100,30 @@ There are a few optional CMake options you can set:
(install-from-source-python-module)=
## Install from Source (Python Module)
Python 3.8 or later is required.
Python 3.10 or later is required.
```bash
# For NVIDIA platforms
$ python -m pip install .
# For AMD platforms, set the C++ compiler to HIPCC
$ CXX=/opt/rocm/bin/hipcc python -m pip install .
# For NVIDIA platforms (specify your CUDA version)
$ python -m pip install ".[cuda12]"
# For AMD platforms
$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]"
```
> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy.
> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source,
> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy.
Optional extras can be installed by specifying them in brackets. Available extras:
- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version.
- **`rocm6`**: Install CuPy from source for AMD ROCm platforms.
- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib).
- **`test`**: Install test dependencies (pytest, mpi4py, netifaces).
```bash
# Example: install with CUDA 12 and benchmark extras
$ python -m pip install ".[cuda12,benchmark]"
# Example: install with all extras for testing on CUDA 12
$ python -m pip install ".[cuda12,benchmark,test]"
```
(vscode-dev-container)=
@@ -155,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
```bash
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
$ python3 -m pip install -r ./python/requirements_cuda12.txt
# Install with benchmark dependencies and the appropriate CUDA/ROCm extras.
# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
$ python3 -m pip install ".[cuda12,benchmark,test]"
$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
```
@@ -171,7 +192,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash
export LD_LIBRARY_PATH=$MSCCLPP_BUILD/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
@@ -189,13 +209,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec
Example 1: Allreduce falls back to NCCL's ncclAllReduce, since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=$MSCCLPP_BUILD/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```
Example 2: ReduceScatter still uses the MSCCL++ implementation, since reducescatter is not in the fallback list.
```bash
export LD_LIBRARY_PATH=$MSCCLPP_BUILD/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

View File

@@ -101,7 +101,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

View File

@@ -69,7 +69,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) {
return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

View File

@@ -1,193 +1,117 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
import os
import torch
import mscclpp.utils as mscclpp_utils
import mscclpp
import mscclpp.ext
import netifaces as ni
import ipaddress
import netifaces as ni
import torch
import mscclpp
import mscclpp.ext
import mscclpp.utils as mscclpp_utils
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
return collection_builder.build_default_algorithms(
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
# -- Helpers ------------------------------------------------------------------
def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
"""Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
# PyTorch's from_dlpack does not support certain float8 DLPack type codes.
# Work around by importing as uint8 and reinterpreting via .view().
_DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
if dtype in _DLPACK_UNSUPPORTED:
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
return torch.utils.dlpack.from_dlpack(dlpack)
def _load_algorithms(scratch: torch.Tensor, rank: int):
return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
scratch_buffer=scratch.data_ptr(),
scratch_buffer_size=scratch.nbytes,
rank=rank,
)
def interfaces_for_ip_netifaces(ip: str):
def _interfaces_for_ip(ip: str):
target = ipaddress.ip_address(ip)
for interface in ni.interfaces():
addresses = ni.ifaddresses(interface)
if ni.AF_INET in addresses:
for link in addresses[ni.AF_INET]:
if "addr" in link:
addr = ipaddress.ip_address(link["addr"])
if addr == target:
return interface
for iface in ni.interfaces():
addrs = ni.ifaddresses(iface)
if ni.AF_INET in addrs:
for link in addrs[ni.AF_INET]:
if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
return iface
return None
def to_mscclpp_reduce_op(op: torch.distributed.ReduceOp) -> mscclpp.ReduceOp:
def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
if op == torch.distributed.ReduceOp.SUM:
return mscclpp.ReduceOp.SUM
elif op == torch.distributed.ReduceOp.MIN:
if op == torch.distributed.ReduceOp.MIN:
return mscclpp.ReduceOp.MIN
else:
raise ValueError(f"unsupported op: {op}")
raise ValueError(f"unsupported op: {op}")
def _round_pow2(size: int) -> int:
"""Round up to next power-of-2, clamped to [1024, 256 MB]."""
size = max(size, 1024)
size = min(size, 256 << 20)
return 1 << (size - 1).bit_length()
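# Illustrative values for the clamp-then-round behavior above:
#   _round_pow2(100)     -> 1024      (clamped up to the 1 KiB floor)
#   _round_pow2(3000)    -> 4096      (rounded up to the next power of two)
#   _round_pow2(1 << 20) -> 1 << 20   (powers of two map to themselves)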
# -- CustomizedComm -----------------------------------------------------------
class CustomizedComm:
def __init__(self, comm: mscclpp.CommGroup):
"""Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
_TUNE_N_WARMUP = 5
_TUNE_N_GRAPH_LAUNCHES = 10
_TUNE_N_OPS_PER_GRAPH = 100
_CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
_CANDIDATE_NTHREADS = [512, 768, 1024]
_NBLOCKS_LIMIT = {
"default_allreduce_nvls_packet": 16,
"default_allreduce_packet": 56,
"default_allreduce_allpair_packet": 56,
"default_allreduce_fullmesh": 64,
"default_allgather_fullmesh2": 32,
}
def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
self.comm = comm
self.rank = comm.my_rank
self.world_size = comm.nranks
self.local_rank = comm.my_rank % comm.nranks_per_node
self.n_ranks_per_node = comm.nranks_per_node
dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
self._algorithm_nvls_packet = [
algo
for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet"
][0]
self._algorithm_rsag_zero_copy = [
algo
for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy"
][0]
self._algorithm_packet = [
algo for algo in algorithms if algo.collective == "allreduce" and algo.name == "default_allreduce_packet"
][0]
if mscclpp.is_nvls_supported():
self._algorithm_nvls_zero_copy = [
algo
for algo in algorithms
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy"
][0]
self._tune(n_warmup=5, n_graph_launches=10, n_ops_per_graph=100)
self.symmetric_memory = symmetric_memory
self._nvls = mscclpp.is_nvls_supported()
def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph):
sizes = [1 << i for i in range(10, 28)]
# Pre-fill with defaults for barrier
self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)}
self._scratch = _make_tensor(1 << 27, torch.float16)
self._barrier_tensor = _make_tensor(4096, torch.float32)
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
tune_tensor.normal_()
candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128]
candidates_nthreads = [512, 768, 1024]
algos = _load_algorithms(self._scratch, self.rank)
self._algos = {(a.collective, a.name): a for a in algos}
for size in sizes:
algos = []
if mscclpp.is_nvls_supported():
algos.append(self._algorithm_nvls_zero_copy)
if size <= 4 * 1024 * 1024:
algos.append(self._algorithm_nvls_packet)
algos.append(self._algorithm_packet)
if size >= 512 * 1024:
algos.append(self._algorithm_rsag_zero_copy)
# {collective: {rounded_size: (algo, nblocks, nthreads)}}
self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
self._tune_buf = None
self._time_buf = None
best_time = float("inf")
best_config = None
def _algo(self, collective: str, name: str):
return self._algos.get((collective, name))
for algo in algos:
for nb in candidates_nblocks:
if algo.name == "default_allreduce_nvls_packet" and nb > 16:
continue
if algo.name == "default_allreduce_packet" and nb > 56:
continue
for nt in candidates_nthreads:
if self._run_algo(algo, tune_tensor, size, nb, nt) != 0:
continue
def _default_ar_config(self):
"""Fallback allreduce config for barrier / timing sync."""
pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and pkt:
return (pkt, 0, 0)
return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
for _ in range(n_warmup):
self._run_algo(algo, tune_tensor, size, nb, nt)
self.barrier()
# -- low-level execute --
capture_stream = torch.cuda.Stream()
capture_stream.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
# Warmup on capture stream
with torch.cuda.stream(capture_stream):
self._run_algo(algo, tune_tensor, size, nb, nt)
capture_stream.synchronize()
with torch.cuda.graph(g, stream=capture_stream):
for _ in range(n_ops_per_graph):
self._run_algo(algo, tune_tensor, size, nb, nt)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record(capture_stream)
with torch.cuda.stream(capture_stream):
for _ in range(n_graph_launches):
g.replay()
end_event.record(capture_stream)
end_event.synchronize()
elapsed = start_event.elapsed_time(end_event)
# Synchronize timing results across all ranks to ensure consistent algorithm selection
# replicate the value n times due to algo size limitations
time_tensor = torch.full((self.world_size,), elapsed, dtype=torch.float64, device="cuda").to(
dtype=torch.float32
)
torch.cuda.current_stream().wait_stream(capture_stream)
# TODO: using all_reduce may cause problems if the elapsed times of different algos are too close.
# May change to broadcast in the future if that becomes an issue.
self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM)
avg_time = time_tensor[self.rank].item() / self.world_size
if avg_time < best_time:
best_time = avg_time
best_config = (algo, nb, nt)
if best_config:
self.best_configs[size] = best_config
if self.rank == 0:
print(
f"Size {size}: Best Algo {best_config[0].name} nblocks {best_config[1]} nthreads {best_config[2]} Time {(best_time/(n_graph_launches * n_ops_per_graph))*1000:.2f} us"
)
# reset the algorithms after tuning
torch.cuda.synchronize()
for algo in algos:
algo.reset()
def _run_algo(self, algo: mscclpp.Algorithm, tensor, size, nblocks, nthreads):
return algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
output_buffer=tensor.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
symmetric_memory=True,
)
def get_tuned_config(self, size):
if size < 1024:
target_size = 1024
elif size > 256 * 1024 * 1024:
target_size = 256 * 1024 * 1024
else:
target_size = 1 << (size - 1).bit_length()
return self.best_configs.get(target_size)
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
assert op == torch.distributed.ReduceOp.SUM
config = self.get_tuned_config(tensor.nbytes)
algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0)
def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=tensor.data_ptr(),
@@ -195,107 +119,357 @@ class CustomizedComm:
input_size=tensor.nbytes,
output_size=tensor.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
op=to_mscclpp_reduce_op(op),
stream=stream.cuda_stream if stream is not None else torch.cuda.current_stream().cuda_stream,
nblocks=nblocks,
nthreads_per_block=nthreads,
symmetric_memory=True,
op=op,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
accum_dtype=accum_dtype,
)
if ret != 0:
print(f"Rank {self.rank}: Algo {algo.name} failed with error {ret}")
print(f"Rank {self.rank}: {algo.name} failed ({ret})")
return ret
def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
if sym is None:
sym = self.symmetric_memory
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
ret = algo.execute(
comm=self.comm.communicator,
input_buffer=inp.data_ptr(),
output_buffer=out.data_ptr(),
input_size=inp.nbytes,
output_size=out.nbytes,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
op=mscclpp.ReduceOp.NOP,
stream=s,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=sym,
)
if ret != 0:
print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
return ret
def _barrier_internal(self):
a, nb, nt = self._default_ar_config()
self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
# -- lazy tuning --
def _ensure_tune_bufs(self):
if self._tune_buf is None:
self._tune_buf = _make_tensor(1 << 27, torch.float16)
self._tune_buf.normal_()
self._time_buf = _make_tensor(4096, torch.float32)
return self._tune_buf
def _ar_candidates(self, size: int):
out = []
if size <= 4 << 20:
a = self._algo("allreduce", "default_allreduce_nvls_packet")
if self._nvls and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_packet")
if a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_allpair_packet")
if a:
out.append(a)
if size >= 512 << 10:
a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
if self._nvls and self.symmetric_memory and a:
out.append(a)
a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
if a:
out.append(a)
if torch.version.hip is not None:
a = self._algo("allreduce", "default_allreduce_fullmesh")
if a:
out.append(a)
return out
def _ag_candidates(self):
a = self._algo("allgather", "default_allgather_fullmesh2")
return [a] if a else []
def _run_tune(self, collective, algo, buf, size, nb, nt):
"""Single tune invocation for either collective."""
if collective == "allreduce":
return algo.execute(
comm=self.comm.communicator,
input_buffer=buf.data_ptr(),
output_buffer=buf.data_ptr(),
input_size=size,
output_size=size,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.SUM,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=True,
)
else:
total = size * self.world_size
out_ptr = buf.data_ptr()
return algo.execute(
comm=self.comm.communicator,
input_buffer=out_ptr + self.rank * size,
output_buffer=out_ptr,
input_size=size,
output_size=total,
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
op=mscclpp.ReduceOp.NOP,
stream=torch.cuda.current_stream().cuda_stream,
nblocks=nb,
nthreads_per_block=nt,
symmetric_memory=False,
)
def _tune_size(self, collective: str, target_size: int):
"""Auto-tune one (collective, target_size) pair and cache result."""
buf = self._ensure_tune_bufs()
cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
best_time, best_cfg = float("inf"), None
used = set()
run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
for algo in cands:
nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
for nb in self._CANDIDATE_NBLOCKS:
if nb > nb_limit:
continue
for nt in self._CANDIDATE_NTHREADS:
# Feasibility — sync result across ranks so all agree
ret = run(algo, nb, nt)
torch.cuda.synchronize()
self._time_buf[0] = float(ret)
self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
if self._time_buf[0].item() != 0:
continue
used.add(algo)
# Warmup
for _ in range(self._TUNE_N_WARMUP):
run(algo, nb, nt)
# CUDA-graph timed benchmark
cs = torch.cuda.Stream()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(self._TUNE_N_OPS_PER_GRAPH):
run(algo, nb, nt)
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
start.record(cs)
with torch.cuda.stream(cs):
for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
g.replay()
end.record(cs)
end.synchronize()
elapsed = start.elapsed_time(end)
# Cross-rank timing sync
self._time_buf.fill_(elapsed)
torch.cuda.current_stream().wait_stream(cs)
self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
avg = self._time_buf[self.rank].item() / self.world_size
if avg < best_time:
best_time, best_cfg = avg, (algo, nb, nt)
if best_cfg:
self._tune_cache[collective][target_size] = best_cfg
if self.rank == 0:
n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
print(
f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
flush=True,
)
else:
fb = (
self._default_ar_config()
if collective == "allreduce"
else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
)
self._tune_cache[collective][target_size] = fb
torch.cuda.synchronize()
self._barrier_internal()
for a in used:
a.reset()
# -- public API --
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
sz = _round_pow2(tensor.nbytes)
if sz not in self._tune_cache["allreduce"]:
self._tune_size("allreduce", sz)
a, nb, nt = self._tune_cache["allreduce"][sz]
self._exec_ar(
tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
)
def all_gather(self, output_tensor, input_tensor, stream=None):
sz = _round_pow2(input_tensor.nbytes)
if sz not in self._tune_cache["allgather"]:
self._tune_size("allgather", sz)
a, nb, nt = self._tune_cache["allgather"][sz]
self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
def barrier(self):
tensor = torch.empty(self.world_size, dtype=torch.float, device=torch.device("cuda"))
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
low = 5 * 1024
high = 80 * 1024 * 1024
sizes = []
curr = low
while curr <= high:
sizes.append(curr)
curr *= 2
if self.rank == 0:
print(f"{'Size (Bytes)':<20} {'Time (us)':<20} {'AlgoBW (GB/s)':<20}")
dtype = torch.float16
capture_stream = torch.cuda.Stream()
# Allocate a single large RawGpuBuffer (symmetric memory) and reuse it for all sizes.
# Cannot allocate per-size tensors with symmetric memory.
bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype))
bench_buf = torch.utils.dlpack.from_dlpack(bench_buf)
bench_buf.normal_()
for size in sizes:
n_elements = size // bench_buf.element_size()
tensor = bench_buf[:n_elements]
capture_stream.wait_stream(torch.cuda.current_stream())
# Capture Graph
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=capture_stream):
for _ in range(n_iter_per_graph):
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
# Warmup: replay the graph to prime the driver
with torch.cuda.stream(capture_stream):
for _ in range(n_warmup):
g.replay()
self.barrier()
capture_stream.synchronize()
# Benchmark
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record(capture_stream)
with torch.cuda.stream(capture_stream):
for _ in range(n_graph_launches):
g.replay()
end_event.record(capture_stream)
end_event.synchronize()
# Get elapsed time in milliseconds
elapsed_ms = start_event.elapsed_time(end_event)
avg_time_ms = elapsed_ms / (n_graph_launches * n_iter_per_graph)
time_us = avg_time_ms * 1000
alg_bw = size / (avg_time_ms * 1e-3) if avg_time_ms > 0 else 0
if self.rank == 0:
print(f"{size:<20} {time_us:<20.2f} {alg_bw / 1e9:<20.2f}")
self._barrier_internal()
def destroy(self):
self._algorithm_nvls_zero_copy = None
self._algorithm_nvls_packet = None
self.scratch_buffer = None
self.comm = None
self._algos.clear()
self._tune_cache = {"allreduce": {}, "allgather": {}}
self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
def init_dist() -> CustomizedComm:
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
master_port = os.environ["MSCCLPP_MASTER_PORT"]
interface = interfaces_for_ip_netifaces(master_addr)
if interface is None:
raise ValueError(f"Cannot find network interface for IP address {master_addr}")
interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
return CustomizedComm(mscclpp_group)
# -- Benchmarks (standalone) --------------------------------------------------
def _bench_sizes(low=5 * 1024, high=80 << 20):
sizes, c = [], low
while c <= high:
sizes.append(c)
c *= 2
return sizes
def benchmark_allreduce(
comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
if dtype in (torch.float16, torch.float32, torch.bfloat16):
buf.normal_()
else:
buf.fill_(0)
for size in sizes:
nelems = size // buf.element_size()
t = buf[:nelems]
comm.all_reduce(t, accum_dtype=accum_dtype)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_reduce(t, accum_dtype=accum_dtype)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
sizes = _bench_sizes()
if comm.rank == 0:
print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
cs = torch.cuda.Stream()
buf = _make_tensor(1 << 27, dtype)
if dtype in (torch.float16, torch.float32, torch.bfloat16):
buf.normal_()
else:
buf.fill_(0)
for prs in sizes:
total = prs * comm.world_size
if total > buf.nbytes:
break
nt = total // buf.element_size()
npr = prs // buf.element_size()
out = buf[:nt]
inp = out[comm.rank * npr : (comm.rank + 1) * npr]
comm.all_gather(out, inp)
torch.cuda.synchronize()
cs.wait_stream(torch.cuda.current_stream())
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g, stream=cs):
for _ in range(n_iter):
comm.all_gather(out, inp)
with torch.cuda.stream(cs):
for _ in range(n_warmup):
g.replay()
comm.barrier()
cs.synchronize()
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
s.record(cs)
with torch.cuda.stream(cs):
for _ in range(n_graph_launches):
g.replay()
e.record(cs)
e.synchronize()
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
if comm.rank == 0:
print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
# -- Bootstrap & main ---------------------------------------------------------
def init_dist() -> mscclpp.CommGroup:
addr = os.environ.get("MSCCLPP_MASTER_ADDR")
if addr:
rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
port = os.environ["MSCCLPP_MASTER_PORT"]
iface = _interfaces_for_ip(addr)
if not iface:
raise ValueError(f"No interface for {addr}")
return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
import torch.distributed as dist
dist.init_process_group(backend="gloo")
return mscclpp.CommGroup(torch_group=dist.group.WORLD)
def main():
local = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local)
comm = init_dist()
comm.benchmark(n_warmup=5, n_graph_launches=10, n_iter_per_graph=100)
comm.barrier()
dtype_str = os.environ.get("DTYPE", "float16")
dtype = getattr(torch, dtype_str, torch.float16)
accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
accum_str = os.environ.get("ACCUM_DTYPE")
accum_dtype = accum_map.get(accum_str) if accum_str else None
comm_group = init_dist()
cc = CustomizedComm(comm_group)
print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
cc.barrier()
torch.cuda.synchronize()
comm.destroy()
print(f"rank {local} All-reduce operation completed successfully.")
benchmark_allgather(cc, dtype=dtype)
cc.barrier()
torch.cuda.synchronize()
cc.destroy()
print(f"rank {local} completed successfully.")
if __name__ == "__main__":
main()

View File

@@ -1,19 +1,20 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
import os
from typing import Any, Dict
import torch, torch.distributed as dist
import mscclpp
import mscclpp.ext
from mscclpp.language.collectives import AllReduce
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
from mscclpp.language.program import CollectiveProgram
from mscclpp.language.rank import Rank
from mscclpp.language.utils import AlgoSpec
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram:
gpu_size = spec.world_size
with CollectiveProgram.from_spec(spec) as program:
# Creating Channels
@@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
return program
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = mscclpp.AlgoSpec(
def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int):
spec = AlgoSpec(
name="allreduce_nvls",
collective=AllReduce(8, 1, True),
nranks_per_node=8,
@@ -94,10 +95,10 @@ def init_dist():
rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local = int(os.environ["LOCAL_RANK"])
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
setup_plan(algorithm_collection_builder, rank, world)
algorithm_collection_builder.set_algorithm_selector(selector)
dist.init_process_group(backend="nccl", device_id=local)
dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local))
return rank, world, local

View File

@@ -103,12 +103,14 @@ class Algorithm {
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
/// @param symmetricMemory Whether to use symmetric memory optimization.
/// @param extras Additional parameters for algorithm-specific customization.
/// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype.
/// @return The result of the operation.
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) = 0;
/// Reset the algorithm state, clearing any cached contexts.
virtual void reset() = 0;
@@ -186,10 +188,11 @@ class NativeAlgorithm : public Algorithm {
/// @param nBlocks Number of CUDA blocks.
/// @param nThreadsPerBlock Number of threads per block.
/// @param extras Additional algorithm-specific parameters.
/// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel).
/// @return The result of the operation.
using KernelFunc =
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&, DataType)>;
/// Function type for creating algorithm contexts.
/// @param comm The communicator.
@@ -233,8 +236,8 @@ class NativeAlgorithm : public Algorithm {
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
const std::string& name() const override;
const std::string& collective() const override;
const std::pair<size_t, size_t>& messageRange() const override;
@@ -285,8 +288,8 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
bool symmetricMemory = false,
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
DataType accumDtype = DataType::AUTO) override;
AlgorithmType type() const override { return AlgorithmType::DSL; }
Constraint constraint() const override;
void reset() override;

View File

@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
}
#elif defined(MSCCLPP_DEVICE_HIP)
#else // !defined(MSCCLPP_DEVICE_CUDA)
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
// HIP does not have thread scope enums like CUDA
constexpr auto scopeSystem = 0;
constexpr auto scopeDevice = 0;
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
return __atomic_fetch_add(ptr, val, memoryOrder);
}
#endif // defined(MSCCLPP_DEVICE_HIP)
#endif // !defined(MSCCLPP_DEVICE_CUDA)
} // namespace mscclpp

View File

@@ -390,7 +390,7 @@ struct EndpointConfig {
};
static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
@@ -419,7 +419,7 @@ struct EndpointConfig {
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
/// @param gidIndex GID index.
/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.

View File

@@ -110,6 +110,11 @@ class Env {
/// Default is false.
const bool forceDisableNvls;
/// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
/// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
/// Default is false.
const bool forceDisableGdr;
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// When set to a non-negative value, overrides the `gidIndex` parameter passed to `EndpointConfig::Ib`.
/// Default is -1 (unset, uses the constructor argument which defaults to `EndpointConfig::Ib::DefaultGidIndex`).
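/// Illustrative (assumed launcher syntax): running a job with `MSCCLPP_IB_GID_INDEX=3`
/// makes every `EndpointConfig::Ib` left at the default gidIndex of -1 resolve to
/// GID index 3 when the endpoint is created.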

View File

@@ -64,18 +64,151 @@ using __bfloat162 = __nv_bfloat162;
#endif
/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15.
/// Format (MSB first): [sign:1][exponent:4][mantissa:3]
/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention).
/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6.
struct alignas(1) __fp8_e4m3b15 {
uint8_t __x;
__fp8_e4m3b15() = default;
/// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity).
MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {}
/// Construct from float32 (explicit to avoid ambiguous conversion chains).
MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {}
/// Convert to float32.
MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); }
/// Construct from a raw bit pattern without conversion.
static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) {
__fp8_e4m3b15 r;
r.__x = bits;
return r;
}
private:
/// Decode fp8_e4m3b15 bits → float32.
///
/// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler.
/// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7.
/// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8,
/// then convert fp16 → float32.
static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) {
// Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN.
uint32_t exp = (bits >> 3) & 0xFu;
if (bits == 0x80 || exp == 15) {
union {
uint32_t u;
float f;
} nan_val = {0x7FC00000u};
return nan_val.f;
}
if (bits == 0) return 0.0f;
// Triton-style bit manipulation: fp8 → fp16 → fp32.
// fp8 layout: [S:1][E:4][M:3] (bias=15)
// fp16 layout: [S:1][E:5][M:10] (bias=15)
//
// Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1
// to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15.
// Refer:
// https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34
uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16
uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position
uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign)
uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1
// For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0
// and fp16 mantissa = (fp8_mantissa << 7), which correctly represents
// the subnormal fp16 value since both share bias=15.
// Convert fp16 bits to float via __half (works on host and device, CUDA and HIP).
union {
uint16_t u;
__half h;
} cvt = {fp16_bits};
return __half2float(cvt.h);
}
/// Encode float32 → fp8_e4m3b15 bits.
///
/// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8.
/// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15),
/// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1.
static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) {
union {
float f;
uint32_t u;
} in = {val};
// NaN → 0x80 (negative-zero bit pattern = NaN in fnuz).
if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u;
// Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP).
__half h_val = __float2half_rn(val);
union {
__half h;
uint16_t u;
} cvt = {h_val};
uint16_t fp16_bits = cvt.u;
// Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80.
uint16_t abs_fp16 = fp16_bits & 0x7FFFu;
if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u;
// Reconstruct with sign.
uint16_t sign16 = fp16_bits & 0x8000u;
// Triton-style: fp16 → fp8.
// fp16 layout: [S:1][E:5][M:10] (bias=15)
// fp8 layout: [S:1][E:4][M:3] (bias=15)
//
// mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080)
// This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias.
// Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0
// Finally: prmt for byte extraction.
//
// Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte.
uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u);
// The upper byte now contains [E:4][M:3][round_bit].
// Combine with sign and extract.
uint16_t with_sign = sign16 | adjusted;
uint8_t result = (uint8_t)(with_sign >> 8);
// Zero → 0x00 (ensure positive zero, not negative zero which is NaN).
if ((result & 0x7Fu) == 0) result = 0x00u;
return result;
}
};
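// Worked example (illustrative): bits 0x3C = [S:0][E:0111][M:100], i.e. exponent 7,
// mantissa 4 -> (1 + 4/8) * 2^(7-15) = 1.5 * 2^-8 = 0.005859375. The decode path
// above maps 0x3C to fp16 bits 0x1E00, which encodes the same value. The max
// finite pattern 0x77 = [S:0][E:1110][M:111] decodes to 1.875 * 2^-1 = 0.9375.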
/// Packed 2x fp8_e4m3b15 storage.
struct alignas(2) __fp8x2_e4m3b15 {
uint16_t __x;
};
/// Packed 4x fp8_e4m3b15 storage.
struct alignas(4) __fp8x4_e4m3b15 {
uint32_t __x;
};
namespace mscclpp {
/// Data types supported by mscclpp operations.
enum class DataType {
INT32, // 32-bit signed integer.
UINT32, // 32-bit unsigned integer.
FLOAT16, // IEEE 754 half precision.
FLOAT32, // IEEE 754 single precision.
BFLOAT16, // bfloat16 precision.
FLOAT8_E4M3, // float8 with E4M3 layout.
FLOAT8_E5M2, // float8 with E5M2 layout.
UINT8, // 8-bit unsigned integer.
INT32, // 32-bit signed integer.
UINT32, // 32-bit unsigned integer.
FLOAT16, // IEEE 754 half precision.
FLOAT32, // IEEE 754 single precision.
BFLOAT16, // bfloat16 precision.
FLOAT8_E4M3, // float8 with E4M3 layout.
FLOAT8_E5M2, // float8 with E5M2 layout.
UINT8, // 8-bit unsigned integer.
FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel).
AUTO = 255, // Sentinel: resolve to the input dtype at runtime.
};
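// Illustrative: callers that do not need mixed-precision accumulation can leave
// Algorithm::execute's accumDtype parameter at DataType::AUTO, which resolves to
// the input dtype at runtime (see the Algorithm::execute docs earlier in this diff).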
/// Word array.
@@ -97,6 +230,7 @@ struct alignas(Bytes) Words<Bytes, false> {};
template <typename T, int N, typename StorageT>
union alignas(sizeof(T) * N) VectorTypeImpl {
static_assert(N > 0, "N must be greater than 0");
static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size");
T data[N];
Words<sizeof(T) * N> words;
@@ -127,13 +261,14 @@ union alignas(sizeof(T) * N) VectorTypeImpl {
MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; }
};
// Helper template to get the appropriate vector type for a given element type and count
// Helper template to get the appropriate vector type for a given element type and count.
template <typename T, int N>
struct VectorTypeHelper {
using type =
VectorTypeImpl<T, N,
typename std::conditional_t<N * sizeof(T) == 4, uint32_t,
typename std::conditional_t<N * sizeof(T) == 8, uint2, uint4>>>;
static constexpr int Bytes = N * sizeof(T);
using type = VectorTypeImpl<
T, N,
std::conditional_t<Bytes == 4, uint32_t,
std::conditional_t<Bytes == 8, uint2, std::conditional_t<Bytes <= 16, uint4, Words<Bytes>>>>>;
};
/// Vector type - clean user interface (automatically selects appropriate storage type)
@@ -170,6 +305,11 @@ DEFINE_VEC(bf16x4, __bfloat16, 4, uint2);
DEFINE_VEC(f16x8, __half, 8, uint4);
DEFINE_VEC(bf16x8, __bfloat16, 8, uint4);
// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists.
using f32x8 = VectorType<float, 8>;
using f32x16 = VectorType<float, 16>;
using f16x16 = VectorType<__half, 16>;
#if defined(__FP8_TYPES_EXIST__)
DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3);
DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3);
@@ -181,6 +321,12 @@ DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2);
DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2);
DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4);
#endif
// fp8_e4m3b15 vectors (always available — software type, no HW dependency)
DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15);
DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15);
DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2);
DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4);
#undef DEFINE_VEC
#if defined(MSCCLPP_DEVICE_COMPILE)
@@ -254,6 +400,21 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) {
}
#endif
// --- f32x2 arithmetic ---
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) {
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000)
// Blackwell (SM 10.0+): packed float2 add in a single instruction.
return __fadd2_rn(a.storage, b.storage);
#else
f32x2 result;
result.data[0] = a.data[0] + b.data[0];
result.data[1] = a.data[1] + b.data[1];
return result;
#endif
}
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
__half2 result;
@@ -265,6 +426,18 @@ MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
return result;
}
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) {
// Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd).
const f16x2* a2 = reinterpret_cast<const f16x2*>(&a);
const f16x2* b2 = reinterpret_cast<const f16x2*>(&b);
f16x4 result;
f16x2* r2 = reinterpret_cast<f16x2*>(&result);
r2[0] = a2[0] + b2[0];
r2[1] = a2[1] + b2[1];
return result;
}
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) {
__bfloat162 result;
@@ -449,6 +622,14 @@ MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) {
return (a < b ? a : b);
}
template <>
MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) {
f32x2 result;
result.data[0] = fminf(a.data[0], b.data[0]);
result.data[1] = fminf(a.data[1], b.data[1]);
return result;
}
template <>
MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) {
#if defined(MSCCLPP_DEVICE_HIP)
@@ -489,6 +670,51 @@ MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) {
#endif
}
/// Convert a vector type From to vector type To.
/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks,
/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached.
/// Specialized below for optimized FP8 conversion paths at x2/x4 level.
template <typename To, typename From>
MSCCLPP_DEVICE_INLINE To to(const From& v) {
static_assert(To::Size == From::Size, "to<To, From>: vector sizes must match");
constexpr int N = From::Size;
// Auto-decompose: N > 4 → split into x4 chunks
if constexpr (N > 4 && N % 4 == 0) {
constexpr int nChunks = N / 4;
using FromChunk = VectorType<typename From::ElementType, 4>;
using ToChunk = VectorType<typename To::ElementType, 4>;
const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
To result;
ToChunk* out = reinterpret_cast<ToChunk*>(&result);
#pragma unroll
for (int c = 0; c < nChunks; ++c) {
out[c] = to<ToChunk>(in[c]);
}
return result;
}
// Auto-decompose: N == 4 → split into 2x x2 chunks
else if constexpr (N == 4) {
using FromChunk = VectorType<typename From::ElementType, 2>;
using ToChunk = VectorType<typename To::ElementType, 2>;
const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
To result;
ToChunk* out = reinterpret_cast<ToChunk*>(&result);
out[0] = to<ToChunk>(in[0]);
out[1] = to<ToChunk>(in[1]);
return result;
}
// Base case: element-wise conversion
else {
To result;
#pragma unroll
for (int i = 0; i < N; ++i) {
result.data[i] = static_cast<typename To::ElementType>(v.data[i]);
}
return result;
}
}
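// Illustrative dispatch: to<f32x8>(f8_e5m2x8) splits into two x4 chunks and lands on
// the to<f32x4, f8_e5m2x4> specialization below; to<f32x16>(f8_e5m2x16) first splits
// into four x4 chunks the same way. Only the x2/x4 level needs hand-tuned paths.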
#if defined(__FP8_TYPES_EXIST__)
template <>
MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) {
@@ -551,7 +777,592 @@ MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) {
return result;
}
// --- f8_e4m3 -> f32 specializations ---
/// f8_e4m3x2 -> f32x2.
/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8).
template <>
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3x2>(const f8_e4m3x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
f32x2 result;
result.data[0] = f[0];
result.data[1] = f[1];
return result;
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
f32x2 result;
result.data[0] = __half2float(bit_cast<__half>(h2.x));
result.data[1] = __half2float(bit_cast<__half>(h2.y));
return result;
#else
f32x2 result;
result.data[0] = float(v.data[0]);
result.data[1] = float(v.data[1]);
return result;
#endif
}
/// f8_e4m3x4 -> f32x4.
template <>
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3x4>(const f8_e4m3x4& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false);
auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true);
f32x4 result;
result.data[0] = lo[0];
result.data[1] = lo[1];
result.data[2] = hi[0];
result.data[3] = hi[1];
return result;
#else
const f8_e4m3x2* pair = reinterpret_cast<const f8_e4m3x2*>(&v);
f32x2 lo = to<f32x2>(pair[0]);
f32x2 hi = to<f32x2>(pair[1]);
f32x4 result;
result.data[0] = lo.data[0];
result.data[1] = lo.data[1];
result.data[2] = hi.data[0];
result.data[3] = hi.data[1];
return result;
#endif
}
// --- f8_e5m2 -> f32 specializations ---
/// f8_e5m2x2 -> f32x2.
/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8).
template <>
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e5m2x2>(const f8_e5m2x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0);
f32x2 result;
result.data[0] = f[0];
result.data[1] = f[1];
return result;
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2);
f32x2 result;
result.data[0] = __half2float(bit_cast<__half>(h2.x));
result.data[1] = __half2float(bit_cast<__half>(h2.y));
return result;
#else
f32x2 result;
result.data[0] = float(v.data[0]);
result.data[1] = float(v.data[1]);
return result;
#endif
}
/// f8_e5m2x4 -> f32x4.
template <>
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e5m2x4>(const f8_e5m2x4& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false);
auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true);
f32x4 result;
result.data[0] = lo[0];
result.data[1] = lo[1];
result.data[2] = hi[0];
result.data[3] = hi[1];
return result;
#else
const f8_e5m2x2* pair = reinterpret_cast<const f8_e5m2x2*>(&v);
f32x2 lo = to<f32x2>(pair[0]);
f32x2 hi = to<f32x2>(pair[1]);
f32x4 result;
result.data[0] = lo.data[0];
result.data[1] = lo.data[1];
result.data[2] = hi.data[0];
result.data[3] = hi.data[1];
return result;
#endif
}
// --- f32 -> f8_e4m3 specializations (downcast) ---
/// f32x2 -> f8_e4m3x2.
/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32).
/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2).
/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f32x2>(const f32x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2;
h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
return bit_cast<f8_e4m3x2>(fp8x2);
#elif defined(MSCCLPP_DEVICE_CUDA)
__half_raw h0, h1;
h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
f8_e4m3x2 result;
result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
return result;
#else
f8_e4m3x2 result;
result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
return result;
#endif
}
/// f32x4 -> f8_e4m3x4.
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3x4 to<f8_e4m3x4, f32x4>(const f32x4& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true);
return bit_cast<f8_e4m3x4>(packed);
#else
f32x2 lo, hi;
lo.data[0] = v.data[0];
lo.data[1] = v.data[1];
hi.data[0] = v.data[2];
hi.data[1] = v.data[3];
f8_e4m3x2 lo_fp8 = to<f8_e4m3x2>(lo);
f8_e4m3x2 hi_fp8 = to<f8_e4m3x2>(hi);
f8_e4m3x4 result;
result.data[0] = lo_fp8.data[0];
result.data[1] = lo_fp8.data[1];
result.data[2] = hi_fp8.data[0];
result.data[3] = hi_fp8.data[1];
return result;
#endif
}
// --- f32 -> f8_e5m2 specializations (downcast) ---
/// f32x2 -> f8_e5m2x2.
/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32).
/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2).
/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
template <>
MSCCLPP_DEVICE_INLINE f8_e5m2x2 to<f8_e5m2x2, f32x2>(const f32x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
return bit_cast<f8_e5m2x2>(static_cast<__hip_fp8x2_storage_t>(packed));
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2;
h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2);
return bit_cast<f8_e5m2x2>(fp8x2);
#elif defined(MSCCLPP_DEVICE_CUDA)
__half_raw h0, h1;
h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
f8_e5m2x2 result;
result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2));
result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2));
return result;
#else
f8_e5m2x2 result;
result.data[0] = static_cast<__fp8_e5m2>(v.data[0]);
result.data[1] = static_cast<__fp8_e5m2>(v.data[1]);
return result;
#endif
}
/// f32x4 -> f8_e5m2x4.
template <>
MSCCLPP_DEVICE_INLINE f8_e5m2x4 to<f8_e5m2x4, f32x4>(const f32x4& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true);
return bit_cast<f8_e5m2x4>(packed);
#else
f32x2 lo, hi;
lo.data[0] = v.data[0];
lo.data[1] = v.data[1];
hi.data[0] = v.data[2];
hi.data[1] = v.data[3];
f8_e5m2x2 lo_fp8 = to<f8_e5m2x2>(lo);
f8_e5m2x2 hi_fp8 = to<f8_e5m2x2>(hi);
f8_e5m2x4 result;
result.data[0] = lo_fp8.data[0];
result.data[1] = lo_fp8.data[1];
result.data[2] = hi_fp8.data[0];
result.data[3] = hi_fp8.data[1];
return result;
#endif
}
// --- f8_e4m3 <-> f16 conversion specializations ---
/// f8_e4m3x2 -> f16x2.
/// NVIDIA SM90+: packed intrinsic (1 instruction).
/// HIP gfx942: fp8 -> float -> half (via AMD builtin).
/// Pre-SM90 / fallback: element-wise scalar conversion.
template <>
MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3x2>(const f8_e4m3x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
f16x2 result;
result.data[0] = __float2half(f[0]);
result.data[1] = __float2half(f[1]);
return result;
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
return bit_cast<f16x2>(h2);
#else
f16x2 result;
result.data[0] = static_cast<__half>(v.data[0]);
result.data[1] = static_cast<__half>(v.data[1]);
return result;
#endif
}
/// f16x2 -> f8_e4m3x2.
/// NVIDIA SM90+: packed intrinsic (1 instruction).
/// HIP gfx942: half -> float -> fp8 (via AMD builtin).
/// Pre-SM90: element-wise scalar conversion.
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f16x2>(const f16x2& v) {
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
float f0 = __half2float(v.data[0]);
float f1 = __half2float(v.data[1]);
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false);
return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
__half2_raw h2 = bit_cast<__half2_raw>(v);
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
return bit_cast<f8_e4m3x2>(fp8x2);
#elif defined(MSCCLPP_DEVICE_CUDA)
__half_raw h0, h1;
h0.x = bit_cast<unsigned short>(v.data[0]);
h1.x = bit_cast<unsigned short>(v.data[1]);
f8_e4m3x2 result;
result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
return result;
#else
f8_e4m3x2 result;
result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
return result;
#endif
}
#endif // defined(__FP8_TYPES_EXIST__)
// --- fp8_e4m3b15 <-> fp16 direct conversion specializations ---
// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift
// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits
// vs fp8's 3. fp32 conversions are derived by routing through fp16.
/// f8_e4m3b15x2 -> f16x2.
/// Direct fp8 -> fp16 via branch-free bit manipulation.
template <>
MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
uint16_t in = v.storage.__x;
// Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5.
uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
uint32_t out0 = b0 | (a0 & 0x80008000u);
__half2 h;
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h)) : "r"(out0));
return h;
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// gfx942: same bit manipulation as CUDA, store packed fp16 bits via words[].
uint16_t in = v.storage.__x;
uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
uint32_t out0 = b0 | (a0 & 0x80008000u);
f16x2 result;
result.words[0] = out0;
return result;
#else
f16x2 result;
result.data[0] = __float2half(float(v.data[0]));
result.data[1] = __float2half(float(v.data[1]));
return result;
#endif
}
/// f8_e4m3b15x4 -> f16x4.
/// Uses __byte_perm + lop3 for branch-free vectorized conversion.
template <>
MSCCLPP_DEVICE_INLINE f16x4 to<f16x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
uint32_t in = v.storage.__x;
uint32_t a0 = __byte_perm(0u, in, 0x5746u);
uint32_t a0_shr = a0 >> 1;
uint32_t a0_sign = a0 & 0x80008000u;
uint32_t out0;
asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign));
uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
uint32_t a1_shr = a1 >> 1;
uint32_t a1_sign = a1 & 0x80008000u;
uint32_t out1;
asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign));
f16x4 result;
asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0));
asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1));
return result;
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// gfx942: __byte_perm + bitwise E4→E5 shift (no lop3), store via words[].
uint32_t in = v.storage.__x;
uint32_t a0 = __byte_perm(0u, in, 0x5746u);
uint32_t out0 = ((a0 >> 1) & 0x3f803f80u) | (a0 & 0x80008000u);
uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
uint32_t out1 = ((a1 >> 1) & 0x3f803f80u) | (a1 & 0x80008000u);
f16x4 result;
result.words[0] = out0;
result.words[1] = out1;
return result;
#else
f16x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = __float2half(float(v.data[i]));
}
return result;
#endif
}
/// f16x2 -> f8_e4m3b15x2.
/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack.
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
uint32_t in0;
asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
// Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
alo = alo < 0x3B80u ? alo : 0x3B80u;
ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
uint32_t a0 = alo | (ahi << 16);
a0 = a0 * 2u + 0x00800080u;
uint32_t b0 = a0 | (in0 & 0x80008000u);
uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
return bit_cast<f8_e4m3b15x2>(packed);
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, pack.
uint32_t in0 = v.words[0];
uint32_t abs0 = in0 & 0x7fff7fffu;
uint32_t a0;
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
a0 = a0 * 2u + 0x00800080u;
uint32_t b0 = a0 | (in0 & 0x80008000u);
uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
return bit_cast<f8_e4m3b15x2>(packed);
#else
f8_e4m3b15x2 result;
result.data[0] = __fp8_e4m3b15(__half2float(v.data[0]));
result.data[1] = __fp8_e4m3b15(__half2float(v.data[1]));
return result;
#endif
}
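// Scalar sketch of the encode direction above (illustrative only, hypothetical
// helper name; assumes a finite fp16 input): clamp to the max finite e4m3b15,
// realign E5->E4 by doubling, round half-up at bit 7, and keep the high byte.
inline uint8_t f16BitsToE4m3b15Byte(uint16_t h) {
  uint16_t mag = h & 0x7FFFu;
  if (mag > 0x3B80u) mag = 0x3B80u;                         // clamp to 0.9375
  uint16_t a = static_cast<uint16_t>(mag * 2u + 0x0080u);   // E5->E4 shift, round half up
  uint16_t b = static_cast<uint16_t>(a | (h & 0x8000u));    // reattach sign
  return static_cast<uint8_t>(b >> 8);
}
// Example: fp16 0x3B80 (0.9375) encodes to 0x77, the largest finite e4m3b15 value.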
/// f16x4 -> f8_e4m3b15x4.
/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion.
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
uint32_t in0, in1;
asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0]));
asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
uint32_t abs0 = in0 & 0x7fff7fffu;
uint32_t abs1 = in1 & 0x7fff7fffu;
uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
a0 = a0 * 2u + 0x00800080u;
a1 = a1 * 2u + 0x00800080u;
uint32_t b0, b1;
asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u));
asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u));
uint32_t packed = __byte_perm(b0, b1, 0x7531u);
return bit_cast<f8_e4m3b15x4>(packed);
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
// gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, __byte_perm pack.
uint32_t in0 = v.words[0], in1 = v.words[1];
uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
uint32_t a0, a1;
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
a0 = a0 * 2u + 0x00800080u;
a1 = a1 * 2u + 0x00800080u;
uint32_t b0 = a0 | (in0 & 0x80008000u);
uint32_t b1 = a1 | (in1 & 0x80008000u);
uint32_t packed = __byte_perm(b0, b1, 0x7531u);
return bit_cast<f8_e4m3b15x4>(packed);
#else
f8_e4m3b15x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = __fp8_e4m3b15(__half2float(v.data[i]));
}
return result;
#endif
}
// --- fp8_e4m3b15 <-> f32 conversion specializations (software, always available) ---
/// f8_e4m3b15x2 -> f32x2.
/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
template <>
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
float2 f2 = __half22float2(h);
return bit_cast<f32x2>(f2);
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
f32x2 result;
result.data[0] = __half2float(h.data[0]);
result.data[1] = __half2float(h.data[1]);
return result;
#else
f32x2 result;
result.data[0] = float(v.data[0]);
result.data[1] = float(v.data[1]);
return result;
#endif
}
/// f8_e4m3b15x4 -> f32x4.
/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
template <>
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
__half2 h0, h1;
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h0)) : "r"(h.words[0]));
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h1)) : "r"(h.words[1]));
float2 f0 = __half22float2(h0);
float2 f1 = __half22float2(h1);
f32x4 result;
result.data[0] = f0.x;
result.data[1] = f0.y;
result.data[2] = f1.x;
result.data[3] = f1.y;
return result;
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
f32x4 result;
result.data[0] = __half2float(h.data[0]);
result.data[1] = __half2float(h.data[1]);
result.data[2] = __half2float(h.data[2]);
result.data[3] = __half2float(h.data[3]);
return result;
#else
f32x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = float(v.data[i]);
}
return result;
#endif
}
/// f32x2 -> f8_e4m3b15x2.
/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f32x2>(const f32x2& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
float2 f2 = {v.data[0], v.data[1]};
__half2 h = __float22half2_rn(f2);
return to<f8_e4m3b15x2, f16x2>(h);
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
f16x2 h;
h.data[0] = __float2half_rn(v.data[0]);
h.data[1] = __float2half_rn(v.data[1]);
return to<f8_e4m3b15x2, f16x2>(h);
#else
f8_e4m3b15x2 result;
result.data[0] = __fp8_e4m3b15(v.data[0]);
result.data[1] = __fp8_e4m3b15(v.data[1]);
return result;
#endif
}
/// f32x4 -> f8_e4m3b15x4.
/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
template <>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f32x4>(const f32x4& v) {
#if defined(MSCCLPP_DEVICE_CUDA)
float2 f01 = {v.data[0], v.data[1]};
float2 f23 = {v.data[2], v.data[3]};
__half2 h01 = __float22half2_rn(f01);
__half2 h23 = __float22half2_rn(f23);
f16x4 h;
asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast<uint32_t*>(&h01)));
asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast<uint32_t*>(&h23)));
return to<f8_e4m3b15x4, f16x4>(h);
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
f16x4 h;
h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
return to<f8_e4m3b15x4, f16x4>(h);
#else
f8_e4m3b15x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = __fp8_e4m3b15(v.data[i]);
}
return result;
#endif
}
// --- fp8_e4m3b15 arithmetic (software, always available) ---
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
return __fp8_e4m3b15(float(a) + float(b));
}
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
f8_e4m3b15x2 result;
result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0]));
result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1]));
return result;
}
template <bool UseClip = true>
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
f8_e4m3b15x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i]));
}
return result;
}
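// Note: all fp8_e4m3b15 arithmetic above round-trips through float, so each element
// costs a decode, an fp32 add, and a re-quantize; there is no hardware-packed path
// for this software type.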
// --- fp8_e4m3b15 min (software) ---
template <>
MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
return __fp8_e4m3b15(fminf(float(a), float(b)));
}
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
f8_e4m3b15x2 result;
result.data[0] = mscclpp::min(a.data[0], b.data[0]);
result.data[1] = mscclpp::min(a.data[1], b.data[1]);
return result;
}
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
f8_e4m3b15x4 result;
#pragma unroll
for (int i = 0; i < 4; ++i) {
result.data[i] = mscclpp::min(a.data[i], b.data[i]);
}
return result;
}
#endif // MSCCLPP_DEVICE_COMPILE
} // namespace mscclpp

View File

@@ -16,6 +16,7 @@ namespace mscclpp {
class Host2DeviceSemaphore {
private:
Semaphore semaphore_;
std::shared_ptr<uint64_t> inboundToken_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
std::unique_ptr<uint64_t> outboundToken_;
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
/// @param connection The connection associated with this semaphore.
Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
/// Destructor.
~Host2DeviceSemaphore();
/// Move constructor.
Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
/// Move assignment operator.
Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
/// Returns the connection.
/// @return The connection associated with this semaphore.
Connection& connection();
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
private:
Semaphore semaphore_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
detail::UniqueGpuPtr<uint64_t> outboundToken_;
public:
/// Constructor.

View File

@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
/// Signal remote device, ensures prior memory ops complete.
MSCCLPP_DEVICE_INLINE void signal() {
auto outbound = incOutbound();
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
// Using memoryOrderSeqCst is faster for A100.
atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
#else
atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
#endif
}
/// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
MSCCLPP_DEVICE_INLINE void relaxedSignal() {
auto outbound = incOutbound();
atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
#elif defined(MSCCLPP_DEVICE_HIP)
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
#endif
}
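// Why an additive signal instead of storing a precomputed outbound value: concurrent
// or reordered increments can never lose an update, since N signal() calls always
// advance the remote inbound token by exactly N; a waiter simply polls until the
// inbound token reaches its expected count.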
/// Thread-safe read of expected inbound value.
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
return atomicLoad<uint64_t, scopeSystem>(inboundToken, memoryOrderRelaxed);
}
/// Thread-safe read of outbound value.
/// @return The outbound value.
MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
return atomicLoad<uint64_t, scopeDevice>(outboundToken, memoryOrderRelaxed);
}
/// Thread-safe increment of outbound value.
/// @return The incremented outbound value.
MSCCLPP_DEVICE_INLINE uint64_t incOutbound() {
return atomicFetchAdd<uint64_t, scopeDevice>(outboundToken, 1, memoryOrderRelaxed) + 1;
}
#endif // defined(MSCCLPP_DEVICE_COMPILE)
/// A local memory space where the remote device will write its semaphore value and the local device will read it.
uint64_t* inboundToken;
/// A local memory space where the local device stores the semaphore value to be written to the remote device.
uint64_t* outboundToken;
/// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the
/// remote device.
/// A remote memory space where the local device atomically increments. This is inboundToken of the remote device.
uint64_t* remoteInboundToken;
/// A local memory space where the local device stores the expected value of the inboundToken to wait for.

View File

@@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build"
name = "mscclpp"
dynamic = ["version"]
description = "MSCCL++ Python API"
requires-python = ">=3.8"
requires-python = ">=3.10"
dependencies = [
"numpy",
"blake3",
"pybind11",
"sortedcontainers",
]
[project.optional-dependencies]
cuda11 = ["cupy-cuda11x"]
cuda12 = ["cupy-cuda12x"]
cuda13 = ["cupy-cuda13x"]
rocm6 = ["cupy"]
benchmark = [
"mpi4py",
"prettytable",
"netifaces",
"matplotlib",
]
test = [
"pytest",
"mpi4py",
"netifaces",
]
[tool.setuptools_scm]
write_to = "python/mscclpp/_version.py"
@@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF"
[tool.black]
line-length = 120
target-version = ['py38']
target-version = ['py310']
include = '\.pyi?$'

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
include(FetchContent)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2)
FetchContent_MakeAvailable(nanobind)
@@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
if(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM)
endif()
install(TARGETS mscclpp_py LIBRARY DESTINATION .)

View File

@@ -75,15 +75,17 @@ void register_algorithm(nb::module_& m) {
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
std::unordered_map<std::string, uintptr_t> extras) {
std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
nBlocks, nThreadsPerBlock, symmetricMemory, extras);
nBlocks, nThreadsPerBlock, symmetricMemory, extras,
static_cast<DataType>(accumDtype));
},
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>())
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
.def("reset", &Algorithm::reset);
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")

View File

@@ -47,7 +47,8 @@ void register_core(nb::module_& m) {
.value("bfloat16", DataType::BFLOAT16)
.value("float8_e4m3", DataType::FLOAT8_E4M3)
.value("float8_e5m2", DataType::FLOAT8_E5M2)
.value("uint8", DataType::UINT8);
.value("uint8", DataType::UINT8)
.value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
nb::class_<Bootstrap>(m, "CppBootstrap")
.def("get_rank", &Bootstrap::getRank)

View File

@@ -28,6 +28,7 @@ void register_env(nb::module_& m) {
.def_ro("force_nccl_fallback_operation", &Env::forceNcclFallbackOperation)
.def_ro("nccl_symmetric_memory", &Env::ncclSymmetricMemory)
.def_ro("force_disable_nvls", &Env::forceDisableNvls)
.def_ro("force_disable_gdr", &Env::forceDisableGdr)
.def_ro("ib_gid_index", &Env::ibGidIndex);
m.def("env", &env);

View File

@@ -4,6 +4,7 @@
#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/vector.h>

View File

@@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) {
return DLDataType{kDLBfloat, 16, 1};
} else if (type == "torch.float16") {
return DLDataType{kDLFloat, 16, 1};
} else if (type == "torch.float8_e4m3fn") {
return DLDataType{kDLFloat8_e4m3fn, 8, 1};
} else if (type == "torch.float8_e4m3fnuz") {
return DLDataType{kDLFloat8_e4m3fnuz, 8, 1};
} else if (type == "torch.float8_e5m2") {
return DLDataType{kDLFloat8_e5m2, 8, 1};
} else if (type == "torch.float8_e5m2fnuz") {
return DLDataType{kDLFloat8_e5m2fnuz, 8, 1};
} else if (type == "torch.uint8") {
return DLDataType{kDLUInt, 8, 1};
} else if (type == "fp8_e4m3b15") {
// No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes.
return DLDataType{kDLUInt, 8, 1};
} else {
throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage);
}
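// e.g. getDlType("torch.float8_e5m2") yields DLDataType{kDLFloat8_e5m2, 8, 1}, while
// any unrecognized string throws Error with ErrorCode::InvalidUsage.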

View File

@@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) {
nb::class_<MemoryDevice2DeviceSemaphore::DeviceHandle>(memoryDevice2DeviceSemaphore, "DeviceHandle")
.def(nb::init<>())
.def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken)
.def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken)
.def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken)
.def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken)
.def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes {

View File

@@ -57,7 +57,7 @@ default_algo_configs = [
def create_default_plans():
plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default")
plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default")
plan_path = Path(plan_dir)
if plan_path.exists():
shutil.rmtree(plan_path)

View File

@@ -177,6 +177,7 @@ class Algorithm:
nthreads_per_block=0,
symmetric_memory: bool = False,
extras: Optional[Dict[str, int]] = None,
accum_dtype: Optional[CppDataType] = None,
) -> int:
"""Execute the collective algorithm.
@@ -194,10 +195,14 @@ class Algorithm:
nthreads_per_block: Number of threads per block (0 for auto-selection).
symmetric_memory: Whether to use symmetric memory optimization (default: False).
extras: Additional algorithm-specific parameters.
accum_dtype: Data type for accumulation during reduction. If None, defaults to
the same as dtype. Use DataType.float32 for high-precision FP8 accumulation.
Returns:
The result code (0 for success).
"""
merged_extras = dict(extras) if extras is not None else {}
accum_dtype = accum_dtype if accum_dtype is not None else dtype
return self._algorithm.execute(
comm,
int(input_buffer),
@@ -211,7 +216,8 @@ class Algorithm:
nblocks,
nthreads_per_block,
symmetric_memory,
extras if extras is not None else {},
merged_extras,
int(accum_dtype),
)
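# Hypothetical call site (a sketch; the buffer/comm names are assumed): FP8 data
# with float32 accumulation to reduce rounding error across ranks:
#   algo.execute(comm, buf.ptr, buf.ptr, buf.nbytes, buf.nbytes,
#                dtype=DataType.float8_e4m3, op=ReduceOp.SUM,
#                accum_dtype=DataType.float32)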
def reset(self):

View File

@@ -192,6 +192,9 @@ class NativeCodeCompiler:
"""
def __init__(self):
self._initialized = False
def _do_init(self):
self._is_hip = cp.cuda.runtime.is_hip
self._device_arch = get_device_arch()
self._compiler = self._get_compiler()
@@ -226,6 +229,7 @@ class NativeCodeCompiler:
]
self._cache_dir = Path(env().cache_dir) / "native"
self._cache_dir.mkdir(parents=True, exist_ok=True)
self._initialized = True
def _get_compiler(self) -> str:
"""Get the path to the appropriate compiler.
@@ -246,6 +250,8 @@ class NativeCodeCompiler:
Returns:
str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD).
"""
if not self._initialized:
self._do_init()
return self._device_arch
def __call__(self, name: str, file: str, **kwds):
@@ -290,6 +296,8 @@ class NativeCodeCompiler:
>>> # Use the module to create an algorithm
>>> algo = module.create_allreduce_algorithm(comm, buffer, size)
"""
if not self._initialized:
self._do_init()
if not os.path.isfile(file):
raise FileNotFoundError(f"The specified source file does not exist: {file}")

View File

@@ -140,7 +140,7 @@ class MemoryChannel:
for tb_id in tb_list:
tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type)
tb_channel_ids = get_program().setup_channel(tb, self)
tb_channel_ids = get_program().setup_channel(tb_id, self)
op = GetOperation(
src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)],
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)],

View File

@@ -745,7 +745,7 @@ class ReduceOperation(BaseOperation):
remote_dst_buff=self.remote_dst_buff + other.dst_buff,
channel_ids=self.channel_ids,
put_channel_ids=self.put_channel_ids + other.channel_ids,
channel_type=self.channel_type,
channel_type=other.channel_type,
reduce_operation=self.reduce_operation,
tbg_info=self.tbg_info,
packet=self.packet,

View File

@@ -5,6 +5,6 @@ netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
sortedcontainers
blake3
pybind11

View File

@@ -5,6 +5,6 @@ netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
sortedcontainers
blake3
pybind11

View File

@@ -5,6 +5,6 @@ netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
sortedcontainers
blake3
pybind11

View File

@@ -1,10 +1,10 @@
mpi4py==4.1.1
cupy==13.6.0
mpi4py
cupy
prettytable
netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
sortedcontainers
blake3
pybind11

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
include(FetchContent)
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0)
FetchContent_MakeAvailable(nanobind)

View File

@@ -0,0 +1,397 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Correctness test for FP8 allreduce with different accumulation types.
#
# Verifies that FP8 allreduce with higher-precision accumulation produces
# results at least as accurate as native FP8 accumulation, by comparing
# against a float32 reference.
#
# Usage:
# mpirun -np 8 pytest python/test/test_fp8_accum.py -v
import cupy as cp
import numpy as np
import pytest
from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported
from mscclpp.ext import AlgorithmCollectionBuilder
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs.
# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed.
_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip
_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89
pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA")
# ---------------------------------------------------------------------------
# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7)
# ---------------------------------------------------------------------------
def e4m3fn_to_float(uint8_array):
"""Decode a cupy uint8 array of E4M3FN bit patterns to float32."""
bits = uint8_array.astype(cp.int32)
sign = (bits >> 7) & 1
exp = (bits >> 3) & 0xF
mant = bits & 0x7
# Normal: (-1)^s * 2^(exp-7) * (1 + mant/8)
normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32))
# Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8)
subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6))
result = cp.where(exp == 0, subnormal_val, normal_val)
result = cp.where(sign == 1, -result, result)
# Zero
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
# NaN: exp==15 & mant==7
nan_mask = (exp == 15) & (mant == 7)
result = cp.where(nan_mask, cp.float32(float("nan")), result)
return result
def float_to_e4m3fn(f32_array, chunk_size=65536):
"""Encode a cupy float32 array to uint8 E4M3FN bit patterns.
Uses a lookup-table approach: precompute all 128 positive E4M3FN values,
then find nearest match per element via chunked broadcast comparison.
"""
# Build lookup table of all 128 positive E4M3FN values (0x00..0x7F)
all_bytes = cp.arange(128, dtype=cp.uint8)
all_floats = e4m3fn_to_float(all_bytes) # (128,) float32
# Mark NaN entries as inf so they're never selected as nearest
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
# Clamp input and extract sign
clamped = f32_array.astype(cp.float32)
clamped = cp.clip(clamped, -448.0, 448.0)
signs = (clamped < 0).astype(cp.uint8)
absval = cp.abs(clamped)
result = cp.zeros(absval.shape, dtype=cp.uint8)
n = absval.size
absval_flat = absval.ravel()
result_flat = result.ravel()
for start in range(0, n, chunk_size):
end = min(start + chunk_size, n)
chunk = absval_flat[start:end]
# (chunk_size, 128) difference matrix
diffs = cp.abs(chunk[:, None] - all_floats[None, :])
result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
# Combine with sign bit
result = result_flat.reshape(absval.shape)
result = result | (signs << 7)
# Handle exact zero
result = cp.where(absval == 0, cp.uint8(0), result)
return result
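# Illustrative spot-check for the codec helpers above (a sketch, not part of the
# test suite; the helper name is made up for illustration).
def _e4m3fn_decode_examples():
    """Sketch: 0x40 -> exp=8, mant=0 -> 2^(8-7) = 2.0; 0x08 is the smallest
    normal, 2^-6 = 0.015625."""
    vals = e4m3fn_to_float(cp.asarray(np.array([0x40, 0x08], dtype=np.uint8)))
    assert bool(cp.allclose(vals, cp.asarray(np.array([2.0, 0.015625], dtype=np.float32))))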
# ---------------------------------------------------------------------------
# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80)
# ---------------------------------------------------------------------------
def e4m3b15_to_float(uint8_array):
"""Decode a cupy uint8 array of E4M3B15 bit patterns to float32."""
bits = uint8_array.astype(cp.int32)
sign = (bits >> 7) & 1
exp = (bits >> 3) & 0xF
mant = bits & 0x7
# Normal: (-1)^s * 2^(exp-15) * (1 + mant/8)
normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32))
# Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8)
subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14))
result = cp.where(exp == 0, subnormal_val, normal_val)
result = cp.where(sign == 1, -result, result)
# Zero
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
# NaN: exp==15 or negative zero (0x80)
nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80)
result = cp.where(nan_mask, cp.float32(float("nan")), result)
return result
def float_to_e4m3b15(f32_array, chunk_size=65536):
"""Encode a cupy float32 array to uint8 E4M3B15 bit patterns.
Same lookup-table approach as float_to_e4m3fn.
"""
# Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F)
all_bytes = cp.arange(128, dtype=cp.uint8)
all_floats = e4m3b15_to_float(all_bytes) # (128,) float32
# Mark NaN entries as inf so they're never selected as nearest
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
# Clamp input and extract sign
clamped = f32_array.astype(cp.float32)
clamped = cp.clip(clamped, -0.9375, 0.9375)
signs = (clamped < 0).astype(cp.uint8)
absval = cp.abs(clamped)
result = cp.zeros(absval.shape, dtype=cp.uint8)
n = absval.size
absval_flat = absval.ravel()
result_flat = result.ravel()
for start in range(0, n, chunk_size):
end = min(start + chunk_size, n)
chunk = absval_flat[start:end]
# (chunk_size, 128) difference matrix
diffs = cp.abs(chunk[:, None] - all_floats[None, :])
result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
# Combine with sign bit
result = result_flat.reshape(absval.shape)
result = result | (signs << 7)
# Handle exact zero
result = cp.where(absval == 0, cp.uint8(0), result)
return result
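# Round-trip sketch for the e4m3b15 codec above (illustrative; the helper name is
# made up): exactly representable values must survive encode + decode unchanged.
def _e4m3b15_roundtrip_check():
    x = cp.asarray(np.array([0.5, -0.25, 0.9375], dtype=np.float32))
    assert bool(cp.allclose(e4m3b15_to_float(float_to_e4m3b15(x)), x))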
# ---------------------------------------------------------------------------
# Shared test helpers
# ---------------------------------------------------------------------------
def setup_algorithms(mpi_group):
"""Build default algorithms and return (comm_group, algo_map, scratch_buf)."""
comm_group = CommGroup(mpi_group.comm)
scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB
AlgorithmCollectionBuilder.reset()
builder = AlgorithmCollectionBuilder()
algorithms = builder.build_default_algorithms(
scratch_buffer=scratch.data.ptr,
scratch_buffer_size=scratch.nbytes,
rank=comm_group.my_rank,
)
algo_map = {a.name: a for a in algorithms}
return comm_group, algo_map, scratch
def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0):
"""Run allreduce in-place on buffer and return a copy of the result."""
ret = algo.execute(
comm=comm_group.communicator,
input_buffer=buffer.data.ptr,
output_buffer=buffer.data.ptr,
input_size=buffer.nbytes,
output_size=buffer.nbytes,
dtype=dtype,
op=ReduceOp.SUM,
stream=cp.cuda.get_current_stream().ptr,
nblocks=nblocks,
nthreads_per_block=nthreads_per_block,
symmetric_memory=True,
accum_dtype=accum_dtype,
)
cp.cuda.Device().synchronize()
assert ret == 0, f"Allreduce failed with error code {ret}"
return buffer.copy()
# ---------------------------------------------------------------------------
# Test: FP8 E4M3 accumulation correctness
# ---------------------------------------------------------------------------
@parametrize_mpi_groups(8)
@pytest.mark.parametrize(
"algo_name",
[
"default_allreduce_packet",
"default_allreduce_nvls_packet",
"default_allreduce_fullmesh",
"default_allreduce_rsag_zero_copy",
"default_allreduce_allpair_packet",
],
)
@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576])
def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
"""Verify that FP8 E4M3 allreduce with higher-precision accumulation is at
least as accurate as native FP8 accumulation, across all algorithm variants."""
rank = mpi_group.comm.rank
world_size = mpi_group.comm.size
comm_group, algo_map, scratch = setup_algorithms(mpi_group)
if algo_name not in algo_map:
pytest.skip(f"{algo_name} not available")
if "nvls" in algo_name and not is_nvls_supported():
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
algo = algo_map[algo_name]
buf = GpuBuffer(size, dtype=cp.uint8)
accum_configs = [
("fp8_native", DataType.float8_e4m3),
("float16", DataType.float16),
("float32", DataType.float32),
]
# rsag_zero_copy and fullmesh need explicit block/thread counts
if "rsag" in algo_name:
nb = max(1, min(32, size // (world_size * 32)))
nt = 1024
elif "fullmesh" in algo_name:
nb = 35
nt = 512
else:
nb = 0
nt = 0
errors = {}
for accum_label, accum_dtype in accum_configs:
# Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm)
rng = np.random.RandomState(42 + rank)
src_f32 = cp.asarray(rng.randn(size).astype(np.float32))
src_f32 = cp.clip(src_f32, -240.0, 240.0)
src_fp8 = float_to_e4m3fn(src_f32)
# Copy into symmetric buffer
buf[:] = src_fp8
cp.cuda.Device().synchronize()
# Run allreduce
result = run_allreduce(
algo,
comm_group,
buf,
dtype=DataType.float8_e4m3,
accum_dtype=accum_dtype,
nblocks=nb,
nthreads_per_block=nt,
)
result_f32 = e4m3fn_to_float(result)
# Compute float32 reference: sum all ranks' quantized FP8 inputs in float32
ref_f32 = cp.zeros(size, dtype=cp.float32)
for r in range(world_size):
rng_r = np.random.RandomState(42 + r)
rank_data = cp.asarray(rng_r.randn(size).astype(np.float32))
rank_data = cp.clip(rank_data, -240.0, 240.0)
rank_data_fp8 = float_to_e4m3fn(rank_data)
ref_f32 += e4m3fn_to_float(rank_data_fp8)
# Compute errors
abs_err = cp.abs(result_f32 - ref_f32)
mean_abs_err = float(cp.mean(abs_err))
errors[accum_label] = mean_abs_err
# Reset between runs
algo.reset()
# Higher-precision accumulation should be at least as accurate as native fp8
assert (
errors["float16"] <= errors["fp8_native"] + 1e-6
), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})"
assert (
errors["float32"] <= errors["fp8_native"] + 1e-6
), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})"
# ---------------------------------------------------------------------------
# Test: FP8 E4M3B15 accumulation correctness
# ---------------------------------------------------------------------------
@parametrize_mpi_groups(8)
@pytest.mark.parametrize(
"algo_name",
[
"default_allreduce_packet",
"default_allreduce_nvls_packet",
"default_allreduce_rsag_zero_copy",
"default_allreduce_fullmesh",
"default_allreduce_allpair_packet",
],
)
@pytest.mark.parametrize("size", [1024, 4096, 65536])
def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
"""Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at
least as accurate as native E4M3B15 accumulation."""
rank = mpi_group.comm.rank
world_size = mpi_group.comm.size
comm_group, algo_map, scratch = setup_algorithms(mpi_group)
if algo_name not in algo_map:
pytest.skip(f"{algo_name} not available")
if "nvls" in algo_name and not is_nvls_supported():
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
algo = algo_map[algo_name]
buf = GpuBuffer(size, dtype=cp.uint8)
accum_configs = [
("e4m3b15_native", DataType.float8_e4m3b15),
("float16", DataType.float16),
("float32", DataType.float32),
]
# rsag_zero_copy needs explicit block/thread counts, scaled to data size
if "rsag" in algo_name:
nb = max(1, min(32, size // (world_size * 32)))
nt = 1024
else:
nb = 0
nt = 0
errors = {}
for accum_label, accum_dtype in accum_configs:
# Generate deterministic per-rank random uint8 values in valid e4m3b15 range
rng = np.random.RandomState(42 + rank)
raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8))
signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7
src_uint8 = raw | signs
# Fix negative zero -> positive zero
src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8)
# Copy into symmetric buffer
buf[:] = src_uint8
cp.cuda.Device().synchronize()
# Run allreduce
result = run_allreduce(
algo,
comm_group,
buf,
dtype=DataType.float8_e4m3b15,
accum_dtype=accum_dtype,
nblocks=nb,
nthreads_per_block=nt,
)
# Decode result
result_f32 = e4m3b15_to_float(result)
# Compute float32 reference
ref_f32 = cp.zeros(size, dtype=cp.float32)
for r in range(world_size):
rng_r = np.random.RandomState(42 + r)
raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8))
signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7
bits_r = raw_r | signs_r
bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r)
ref_f32 += e4m3b15_to_float(bits_r)
# Clamp reference to e4m3b15 representable range
ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375)
# Compute errors (only on valid entries)
valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
errors[accum_label] = mean_abs_err
algo.reset()
# Higher-precision accumulation should be at least as accurate as native
assert (
errors["float16"] <= errors["e4m3b15_native"] + 1e-8
), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"
assert (
errors["float32"] <= errors["e4m3b15_native"] + 1e-8
), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"

View File

@@ -28,6 +28,16 @@ if(MSCCLPP_USE_IB)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
if(MLX5_FOUND)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
endif()
endif()
if(MSCCLPP_USE_GDRCOPY)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
endif()
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})

View File

@@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
DataType accumDtype) {
if (accumDtype == DataType::AUTO) accumDtype = dtype;
if (!initialized_) {
initFunc_(comm);
initialized_ = true;
@@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const vo
contexts_[ctxKey] = ctx;
}
return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
nThreadsPerBlock, extras);
nThreadsPerBlock, extras, accumDtype);
}
const std::string& NativeAlgorithm::name() const { return name_; }
@@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM
Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }
void NativeAlgorithm::reset() {
contexts_.clear();
initialized_ = false;
}
void NativeAlgorithm::reset() { contexts_.clear(); }
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
std::shared_ptr<Algorithm> algorithm) {
@@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
std::shared_ptr<Executor> executor, int, int, bool,
const std::unordered_map<std::string, uintptr_t>&) {
const std::unordered_map<std::string, uintptr_t>&, DataType) {
if (!executor) {
THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
}
@@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
plan_, stream);
break;
#endif
case DataType::FLOAT8_E4M3B15:
executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
DataType::FLOAT8_E4M3B15, plan_, stream);
break;
case DataType::INT32:
case DataType::UINT32:
executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);

View File

@@ -7,6 +7,7 @@
#include <mscclpp/npkit/npkit.hpp>
#endif
#include <mscclpp/atomic_device.hpp>
#include <mscclpp/numa.hpp>
#include <mscclpp/utils.hpp>
#include <sstream>
@@ -197,45 +198,54 @@ void IBConnection::recvThreadFunc() {
}
}
// Host-side state for reconstructing the full 64-bit token from 32-bit imm_data values
uint32_t lastImmData = 0;
uint64_t immHighBits = 0;
uint64_t newValueHost = 0;
while (!stopRecvThread_.load(std::memory_order_relaxed)) {
auto qp = qp_.lock();
if (!qp) break;
auto qp = qp_.lock();
if (!qp) return;
while (!stopRecvThread_.load(std::memory_order_relaxed)) {
int wcNum = qp->pollRecvCq();
if (wcNum < 0) {
WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");
recvThreadErrorMsg_ = "pollRecvCq failed";
recvThreadError_.store(true, std::memory_order_release);
WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
break;
}
for (int i = 0; i < wcNum; ++i) {
int status = qp->getRecvWcStatus(i);
if (status != static_cast<int>(WsStatus::Success)) {
WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i));
// Post another recv to replace the failed one
qp->stageRecv(/*wrId=*/0);
qp->postRecv();
continue;
// A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
// All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
// and set the error flag so the main thread can detect it.
recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
recvThreadError_.store(true, std::memory_order_release);
WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
return;
}
// The imm_data contains newValue (32-bit, extended to 64-bit)
// Note: getRecvWcImmData already converts from network byte order via ntohl
unsigned int immData = qp->getRecvWcImmData(i);
newValueHost = static_cast<uint64_t>(immData);
// Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
// using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
// are less than the previous value, the upper 32 bits must have incremented by 1.
uint32_t immData = qp->getRecvWcImmData(i);
if (immData < lastImmData) {
immHighBits += (1ULL << 32);
}
lastImmData = immData;
newValueHost = immHighBits | static_cast<uint64_t>(immData);
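// Worked example (illustrative): if lastImmData was 0xFFFFFFF0 and the next imm
// arrives as 0x00000002, the decrease trips the wrap branch, immHighBits advances
// to 1<<32, and the token is reconstructed as 0x100000002. This is exact as long
// as consecutive tokens differ by less than 2^32.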
// Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
uint64_t dstGpuAddr = remoteUpdateDstAddr_;
if (dstGpuAddr != 0) {
uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
// Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream
MSCCLPP_CUDATHROW(
cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData,
")");
// Forward the token to the semaphore's inbound token address via atomicStore
// through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
if (signalAddr_ != 0) {
if (signalGdrMap_ && signalGdrMap_->valid()) {
atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
} else {
// For HIP/ROCm.
// NOTE: may need a fix in the future to ensure BAR1 mapping.
*reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
}
}
// Post another recv for future messages
@@ -250,60 +260,105 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
: BaseConnection(context, localEndpoint),
transport_(localEndpoint.transport()),
remoteTransport_(remoteEndpoint.transport()),
dummyAtomicSource_(std::make_unique<uint64_t>(0)),
atomicSrc_(std::make_unique<uint64_t>(0)),
ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
gdrSignalForwarding_(false),
stopRecvThread_(false),
recvThreadError_(false),
localGpuDeviceId_(localEndpoint.device().id),
signalStream_(nullptr),
remoteUpdateDstAddr_(0) {
signalAddr_(0) {
qp_ = getImpl(localEndpoint).ibQp_;
qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
qp_.lock()->rts();
dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
validateTransport(dummyAtomicSourceMem_, transport_);
dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_);
validateTransport(atomicSrcMem_, transport_);
atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_);
if (ibNoAtomic_) {
// Create a CUDA stream for async memory copies
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
#if defined(MSCCLPP_USE_CUDA)
// On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
if (!gdrEnabled()) {
THROW(CONN, Error, ErrorCode::InvalidUsage,
"IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
}
gdrSignalForwarding_ = true;
#endif // defined(MSCCLPP_USE_CUDA)
// Pre-post receive requests for incoming write-with-imm
// On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
// NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
// routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
// on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
// aarch64), HostNoAtomic works without Data Direct.
//
// We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
// when the device is not a VF. If data corruption occurs, switching to VF devices with
// Data Direct or using IbMode::Host with RDMA atomics will resolve it.
{
IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
if (!ibCtx->isVirtualFunction()) {
WARN(CONN,
"IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
"on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
"Device ",
ibCtx->getDevName(),
" is not a VF. "
"If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
}
}
// Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
// The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
auto qp = qp_.lock();
int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
for (int i = 0; i < maxRecvWr; ++i) {
qp->stageRecv(/*wrId=*/0);
}
qp->postRecv();
// Start the background thread to poll recv CQ
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
// The recv thread is started later in startSignalForwarding() when the semaphore
// provides the signal forwarding destination. This ensures the thread lifetime is
// bounded by the GdrMap lifetime (created before start, destroyed after stop).
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
} else {
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
}
}
IBConnection::~IBConnection() {
if (ibNoAtomic_) {
stopRecvThread_.store(true, std::memory_order_relaxed);
if (recvThread_.joinable()) {
recvThread_.join();
}
if (signalStream_ != nullptr) {
// Synchronize stream to ensure all async copies are complete before destruction
// Ignore errors during teardown (CUDA context may already be destroyed)
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_));
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_));
}
}
}
IBConnection::~IBConnection() { stopSignalForwarding(); }
Transport IBConnection::transport() const { return transport_; }
Transport IBConnection::remoteTransport() const { return remoteTransport_; }
void IBConnection::setRemoteUpdateDstAddr(uint64_t addr) {
remoteUpdateDstAddr_ = addr;
INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr);
bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }
void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
// Set up the forwarding destination and GdrMap, then start the recv thread.
// Order: set address → create GdrMap → start thread.
signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
if (gdrSignalForwarding_) {
signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
}
if (ibNoAtomic_) {
stopRecvThread_.store(false, std::memory_order_relaxed);
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
}
INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
}
void IBConnection::stopSignalForwarding() {
// Stop the recv thread, then tear down GdrMap and address.
// Order: stop thread → destroy GdrMap → clear address.
if (ibNoAtomic_) {
stopRecvThread_.store(true, std::memory_order_relaxed);
if (recvThread_.joinable()) {
recvThread_.join();
}
}
if (gdrSignalForwarding_) {
signalGdrMap_.reset();
}
signalAddr_ = 0;
INFO(CONN, "IBConnection stopSignalForwarding");
}
void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
@@ -356,25 +411,29 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
*src = newValue;
if (ibNoAtomic_) {
// Use RDMA write-with-imm instead of atomic operation
// Send only newValue in imm_data (0-byte write)
// The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write the value
// Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
// Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
// token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
// detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
// indicates the upper 32 bits incremented by 1).
if (newValue <= oldValue) {
WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue);
} else if (newValue - oldValue >= (1ULL << 32)) {
WARN(CONN,
"IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue,
" -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
}
unsigned int immData = static_cast<unsigned int>(newValue);
// Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything)
qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
/*size=*/0, /*wrId=*/0,
/*srcOffset=*/0, /*dstOffset=*/0,
/*signaled=*/true, /*immData=*/immData);
qp_.lock()->postSend();
INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
} else {
qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
/*signaled=*/true);
qp_.lock()->postSend();
INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
" -> ", newValue);
}
@@ -388,6 +447,11 @@ void IBConnection::flush(int64_t timeoutUsec) {
NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0);
#endif
// Check if the recv thread has already reported an error (e.g., QP entered error state).
if (recvThreadError_.load(std::memory_order_acquire)) {
THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
}
Timer timer;
while (qp_.lock()->getNumSendCqItems()) {
int wcNum = qp_.lock()->pollSendCq();

View File

@@ -46,8 +46,6 @@ void CudaIpcStream::sync() {
}
}
Context::Impl::Impl() {}
IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
// Find IB context or create it
auto it = ibContexts_.find(ibTransport);

View File

@@ -47,11 +47,16 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
}
}
// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
if (config_.ib.gidIndex < 0) {
config_.ib.gidIndex = env()->ibGidIndex;
}
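// e.g. with config_.ib.gidIndex == -1 (the sentinel default) and
// MSCCLPP_IB_GID_INDEX=3, the QP is created with GID index 3; an explicit
// non-negative gidIndex is left untouched.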
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
ibQp_ = contextImpl.getIbContext(config_.transport)
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
ibQpInfo_ = ibQp_->getInfo();
} else if (config_.transport == Transport::Ethernet) {
// Configuring Ethernet Interfaces
@@ -74,6 +79,7 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
if (AllIBTransports.has(config_.transport)) {
ibLocal_ = false;
it = detail::deserialize(it, ibQpInfo_);
it = detail::deserialize(it, ibNoAtomic_);
} else if (config_.transport == Transport::Ethernet) {
it = detail::deserialize(it, socketAddress_);
}
@@ -103,6 +109,7 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
detail::serialize(data, pimpl_->pidHash_);
if (AllIBTransports.has(pimpl_->config_.transport)) {
detail::serialize(data, pimpl_->ibQpInfo_);
detail::serialize(data, pimpl_->ibNoAtomic_);
} else if (pimpl_->config_.transport == Transport::Ethernet) {
detail::serialize(data, pimpl_->socketAddress_);
}

View File

@@ -66,6 +66,7 @@ Env::Env()
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", -1)) {}
std::shared_ptr<Env> env() {
@@ -94,6 +95,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
}
return globalEnv;

View File

@@ -82,6 +82,12 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
case DataType::FLOAT8_E5M2:
// FP8 is not supported in CUDA execution kernel.
break;
case DataType::FLOAT8_E4M3B15:
// fp8_e4m3b15 is a software type not supported in the CUDA execution kernel.
break;
case DataType::AUTO:
// AUTO is a sentinel resolved before reaching this point; nothing to do.
break;
}
}

src/core/gdr.cc (new file)
View File

@@ -0,0 +1,204 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gdr.hpp"
#if defined(MSCCLPP_USE_GDRCOPY)
#include <gdrapi.h>
#include <unistd.h>
#include <mscclpp/env.hpp>
#include <mscclpp/gpu_utils.hpp>
#include "logger.hpp"
#ifndef GPU_PAGE_SHIFT
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
#endif
namespace mscclpp {
// GdrContext
class GdrContext {
public:
GdrContext();
~GdrContext();
GdrContext(const GdrContext&) = delete;
GdrContext& operator=(const GdrContext&) = delete;
GdrStatus status() const { return status_; }
gdr_t handle() const { return handle_; }
private:
GdrStatus status_;
gdr_t handle_;
};
static std::shared_ptr<GdrContext> gdrContext() {
static auto instance = std::make_shared<GdrContext>();
return instance;
}
GdrStatus gdrStatus() { return gdrContext()->status(); }
bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
const char* gdrStatusMessage() {
switch (gdrStatus()) {
case GdrStatus::Ok:
return "GDRCopy initialized successfully";
case GdrStatus::NotBuilt:
return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
case GdrStatus::Disabled:
return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
case GdrStatus::DriverMissing:
return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
case GdrStatus::OpenFailed:
return "gdr_open() failed; GDRCopy driver may be misconfigured";
default:
return "unknown GDRCopy status";
}
}
GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
if (env()->forceDisableGdr) {
INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
status_ = GdrStatus::Disabled;
return;
}
// Auto-detect: check if driver is available
if (access("/dev/gdrdrv", F_OK) != 0) {
INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
status_ = GdrStatus::DriverMissing;
return;
}
handle_ = gdr_open();
if (handle_ == nullptr) {
INFO(GPU, "gdr_open() failed, disabling GDRCopy");
status_ = GdrStatus::OpenFailed;
return;
}
status_ = GdrStatus::Ok;
INFO(GPU, "GDRCopy initialized successfully");
}
GdrContext::~GdrContext() {
if (handle_ != nullptr) {
gdr_close(handle_);
handle_ = nullptr;
}
}
// GdrMap::Impl — real implementation with GDRCopy
struct GdrMap::Impl {
std::shared_ptr<GdrContext> ctx;
std::shared_ptr<void> gpuMem;
gdr_mh_t mh;
void* barPtr;
uint64_t* hostDstPtr;
size_t mappedSize;
};
GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
pimpl_->ctx = gdrContext();
pimpl_->gpuMem = std::move(gpuMem);
pimpl_->mh = {};
pimpl_->barPtr = nullptr;
pimpl_->hostDstPtr = nullptr;
pimpl_->mappedSize = 0;
// Ensure CUDA device context is active for gdr_pin_buffer
CudaDeviceGuard deviceGuard(deviceId);
uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
// Align to GPU page boundary and pin one page around the target address
unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
unsigned long pageOffset = gpuAddr - alignedAddr;
pimpl_->mappedSize = GPU_PAGE_SIZE;
// Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
// ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
// not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
// system-scope acquire.
int ret =
gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
if (ret != 0) {
ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
if (ret != 0) {
THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
}
}
ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
if (ret != 0) {
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
}
pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);
INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
}
GdrMap::~GdrMap() {
if (pimpl_) {
if (pimpl_->barPtr != nullptr) {
(void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
}
if (pimpl_->hostDstPtr != nullptr) {
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
}
}
}
bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }
uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
void GdrMap::copyFrom(void* dst, size_t size) const {
gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
}
} // namespace mscclpp
#else // !defined(MSCCLPP_USE_GDRCOPY)
namespace mscclpp {
GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
bool gdrEnabled() { return false; }
const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
// GdrMap::Impl — stub (no GDRCopy)
struct GdrMap::Impl {};
GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
GdrMap::~GdrMap() = default;
bool GdrMap::valid() const { return false; }
uint64_t* GdrMap::hostPtr() const { return nullptr; }
void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
} // namespace mscclpp
#endif // !defined(MSCCLPP_USE_GDRCOPY)
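
As a usage illustration for the mapping flow above, the following host-side sketch pins one 8-byte GPU token through GdrMap and stores a value the GPU can observe by polling. It is a hedged example, not part of this diff; it only uses names introduced here (gdrEnabled, gdrStatusMessage, GdrMap, copyTo) plus detail::gpuCallocShared from this change.

// Illustrative sketch: CPU-visible signaling through a GDRCopy BAR1 mapping.
void exampleGdrSignal(int deviceId) {
  using namespace mscclpp;
  if (!gdrEnabled()) {
    INFO(GPU, "GDRCopy unavailable: ", gdrStatusMessage());  // fall back to another path
    return;
  }
  // One 8-byte token on the GPU; GdrMap pins its page and maps it into host BAR1 space.
  std::shared_ptr<uint64_t> token = detail::gpuCallocShared<uint64_t>();
  GdrMap map(token, deviceId);
  if (!map.valid()) return;
  uint64_t newValue = 42;
  map.copyTo(&newValue, sizeof(newValue));  // GPU threads polling *token will observe 42
}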

@@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) {
UnixSocketServer::instance().unregisterFd(handle->posixFd.fd);
::close(handle->posixFd.fd);
}
if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) {
if (handle->fabric.allocHandle != 0) {
cuMemRelease(handle->fabric.allocHandle);
}
}
delete handle;
}
}
@@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter);
handle->typeFlags = GpuIpcMemHandle::Type::None;
handle->posixFd.fd = -1;
handle->fabric.allocHandle = {};
CUdeviceptr basePtr;
size_t sz;
@@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
// FABRIC handle
if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) ==
CUDA_SUCCESS) {
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr));
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
}
@@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
handle->offsetFromBase = 0;
handle->typeFlags = GpuIpcMemHandle::Type::None;
handle->posixFd.fd = -1;
handle->fabric.allocHandle = {};
// POSIX FD handle
int fileDesc;
@@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle,
CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) {
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
handle->fabric.allocHandle = allocHandle;
}
if (handle->typeFlags == GpuIpcMemHandle::Type::None) {
@@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created");
}
// Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the
// multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle.
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
// Only release allocHandle if it is not stored in fabric.allocHandle.
if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) {
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
}
return handle;
#else // !(CUDA_NVLS_API_AVAILABLE)
THROW(GPU, Error, ErrorCode::InvalidUsage,
@@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
CUDA_SUCCESS) {
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
handle_.fabric.allocHandle = {};
type_ = GpuIpcMemHandle::Type::Fabric;
}
}
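
For orientation, the driver-API lifecycle behind the fabric-handle changes above follows this pattern. This is a hedged sketch using only documented CUDA driver calls; basePtr, allocHandle, and error handling are assumed context, not code from this diff.

// Export side: a successful FABRIC export is paired with a retained allocation
// handle so the deleter can cuMemRelease() it when the GpuIpcMemHandle dies.
CUmemFabricHandle fabricHandle;
if (cuMemExportToShareableHandle(&fabricHandle, allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS) {
  CUmemGenericAllocationHandle retained;
  cuMemRetainAllocationHandle(&retained, (void*)basePtr);  // ref-count +1, released in deleter()
}
// Import side: the peer gets its own process-local handle, which is why the
// serialized fabric.allocHandle is cleared on import above.
CUmemGenericAllocationHandle imported;
cuMemImportFromShareableHandle(&imported, (void*)&fabricHandle, CU_MEM_HANDLE_TYPE_FABRIC);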

@@ -21,6 +21,9 @@
#include "context.hpp"
#if defined(USE_IBVERBS)
#include "ibverbs_wrapper.hpp"
#if defined(MSCCLPP_USE_MLX5DV)
#include "mlx5dv_wrapper.hpp"
#endif // defined(MSCCLPP_USE_MLX5DV)
#endif // defined(USE_IBVERBS)
#include "logger.hpp"
@@ -64,7 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
return ret;
}
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) {
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
if (size == 0) {
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
}
@@ -80,13 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
bool isGpuBuff = (gpuId != -1);
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
#if !defined(MSCCLPP_USE_ROCM)
int fd;
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
int fd = -1;
size_t rangeSize = pages * pageSize;
// Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
// bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
// routes DMA through the Data Direct engine for correct ordering and higher throughput.
// Fall back to the default (non-PCIe) mapping if the flag is unsupported.
#if (CUDA_VERSION >= 12030)
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
if (cuRes != CUDA_SUCCESS || fd < 0) {
if (fd >= 0) ::close(fd);
fd = -1;
}
bool usedPcieFlag = (fd >= 0);
#endif // CUDA_VERSION >= 12030
if (fd < 0) {
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
}
// Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
// which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
size_t offsetInDmaBuf = buffIntPtr % pageSize;
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;
#if defined(MSCCLPP_USE_MLX5DV)
if (isDataDirect) {
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif
if (mr_ == nullptr) {
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
// If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
#if (CUDA_VERSION >= 12030)
if (mr_ == nullptr && usedPcieFlag) {
::close(fd);
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif // CUDA_VERSION >= 12030
::close(fd);
if (mr_ == nullptr) {
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
@@ -131,7 +171,7 @@ const void* IbMr::getBuff() const { return buff_; }
uint32_t IbMr::getLkey() const { return mr_->lkey; }
IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
int maxSendWr, int maxRecvWr, int maxWrPerSend)
int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
: portNum_(portNum),
gidIndex_(gidIndex),
info_(),
@@ -151,7 +191,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
maxSendCqPollNum_(maxSendCqPollNum),
maxSendWr_(maxSendWr),
maxWrPerSend_(maxWrPerSend),
maxRecvWr_(maxRecvWr) {
maxRecvWr_(maxRecvWr),
noAtomic_(noAtomic) {
sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
if (sendCq_ == nullptr) {
THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -211,7 +252,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
qpAttr.qp_state = IBV_QPS_INIT;
qpAttr.pkey_index = 0;
qpAttr.port_num = portNum_;
qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE
: (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")");
}
@@ -240,7 +282,7 @@ void IbQp::rtr(const IbQpInfo& info) {
qp_attr.path_mtu = static_cast<ibv_mtu>(info.mtu);
qp_attr.dest_qp_num = info.qpn;
qp_attr.rq_psn = 0;
qp_attr.max_dest_rd_atomic = 1;
qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1;
qp_attr.min_rnr_timer = 0x12;
if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) {
qp_attr.ah_attr.is_global = 1;
@@ -272,7 +314,7 @@ void IbQp::rts() {
qp_attr.retry_cnt = 7;
qp_attr.rnr_retry = 7;
qp_attr.sq_psn = 0;
qp_attr.max_rd_atomic = 1;
qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1;
int ret = IBVerbs::ibv_modify_qp(
qp_, &qp_attr,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
@@ -434,12 +476,38 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_
unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }
IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) {
IbCtx::IbCtx(const std::string& devName)
: devName_(devName),
ctx_(nullptr),
pd_(nullptr),
supportsRdmaAtomics_(false),
isMlx5_(false),
isDataDirect_(false),
isVF_(false) {
int num;
struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
if (std::string(devices[i]->name) == devName_) {
ctx_ = IBVerbs::ibv_open_device(devices[i]);
// Detect if this IB device is a Virtual Function (VF).
// VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
{
std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
if (isVF_) {
INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
}
}
#if defined(MSCCLPP_USE_MLX5DV)
if (MLX5DV::isAvailable()) {
isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
if (isMlx5_) {
INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs");
}
}
#endif // defined(MSCCLPP_USE_MLX5DV)
break;
}
}
@@ -452,6 +520,20 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_
THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
}
// Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5_ && MLX5DV::isAvailable()) {
char sysfsPath[256];
int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
if (ret == 0) {
isDataDirect_ = true;
INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
} else {
INFO(NET, "IB device ", devName_, " does not support Data Direct");
}
}
#endif // defined(MSCCLPP_USE_MLX5DV)
// Query and cache RDMA atomics capability
struct ibv_device_attr attr = {};
if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
@@ -512,7 +594,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const {
}
std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend) {
int maxRecvWr, int maxWrPerSend, bool noAtomic) {
if (port == -1) {
port = this->getAnyUsablePort(gidIndex);
if (port == -1) {
@@ -521,16 +603,22 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
} else if (!this->isPortUsable(port, gidIndex)) {
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
}
return std::shared_ptr<IbQp>(
new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend));
return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
maxRecvWr, maxWrPerSend, noAtomic));
}
std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size));
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isDataDirect_));
}
bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }
bool IbCtx::isMlx5() const { return isMlx5_; }
bool IbCtx::isDataDirect() const { return isDataDirect_; }
bool IbCtx::isVirtualFunction() const { return isVF_; }
MSCCLPP_API_CPP int getIBDeviceCount() {
int num;
IBVerbs::ibv_get_device_list(&num);
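
Putting the extended createQp signature in context, a caller could gate the no-atomic (write-with-imm) mode on the device's advertised capabilities, roughly as below. This is a sketch; the queue sizes are placeholder values, not defaults from this diff.

auto ctx = std::make_shared<mscclpp::IbCtx>("mlx5_0");
// Prefer write-with-imm signaling when RDMA atomics are not usable on this device.
bool noAtomic = !ctx->supportsRdmaAtomics();
auto qp = ctx->createQp(/*port*/ -1, /*gidIndex*/ mscclpp::env()->ibGidIndex,
                        /*maxSendCqSize*/ 1024, /*maxSendCqPollNum*/ 1,
                        /*maxSendWr*/ 1024, /*maxRecvWr*/ 1024, /*maxWrPerSend*/ 64,
                        noAtomic);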

@@ -5,6 +5,7 @@
#define MSCCLPP_CONNECTION_HPP_
#include <atomic>
#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mutex>
@@ -15,6 +16,7 @@
#include "communicator.hpp"
#include "context.hpp"
#include "endpoint.hpp"
#include "gdr.hpp"
#include "ib.hpp"
#include "registered_memory.hpp"
#include "socket.h"
@@ -35,11 +37,18 @@ class BaseConnection {
virtual void flush(int64_t timeoutUsec = -1) = 0;
/// Set the local address where remote updateAndSync operations should write.
/// This is called by the receiver to specify where incoming signals should be written.
/// Default implementation is a no-op for connections that don't need it.
/// @param addr The local address for incoming writes.
virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {}
/// Start signal forwarding to the given memory address.
/// Called by the semaphore to specify where incoming signals should be written.
/// @param mem Shared pointer to the GPU memory for the signal token.
virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}
/// Stop signal forwarding and release associated resources.
virtual void stopSignalForwarding() {}
/// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
/// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
/// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
virtual bool isSignalForwarding() const { return false; }
virtual Transport transport() const = 0;
@@ -91,22 +100,29 @@ class IBConnection : public BaseConnection {
Transport transport_;
Transport remoteTransport_;
std::weak_ptr<IbQp> qp_;
std::unique_ptr<uint64_t> dummyAtomicSource_; // not used anywhere but IB needs a source
RegisteredMemory dummyAtomicSourceMem_;
mscclpp::TransportInfo dstTransportInfo_;
std::unique_ptr<uint64_t> atomicSrc_;
RegisteredMemory atomicSrcMem_;
mscclpp::TransportInfo atomicSrcTransportInfo_;
// For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
// instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
bool ibNoAtomic_;
bool gdrSignalForwarding_; // ibNoAtomic_ && gdrEnabled() — decided once at construction
std::thread recvThread_;
std::atomic<bool> stopRecvThread_;
int localGpuDeviceId_; // Local GPU device ID for setting CUDA context in recv thread
cudaStream_t signalStream_;
std::atomic<bool> recvThreadError_; // Set by recv thread on fatal error
std::string recvThreadErrorMsg_; // Error message from recv thread (written before recvThreadError_ is set)
int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping
// Write-with-imm design:
// - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit)
// - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write
uint64_t remoteUpdateDstAddr_;
// Signal forwarding design (HostNoAtomic mode):
// - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
// - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
// the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
// detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
// incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
uint64_t signalAddr_;
std::unique_ptr<GdrMap> signalGdrMap_;
void recvThreadFunc();
@@ -114,10 +130,15 @@ class IBConnection : public BaseConnection {
IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
~IBConnection();
/// Set the local address where remote updateAndSync operations will write.
/// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
/// @param addr The local address for incoming writes.
void setRemoteUpdateDstAddr(uint64_t addr) override;
/// Start signal forwarding to the given memory address.
/// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
/// @param mem Shared pointer to the GPU memory for the signal token.
void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;
/// Stop signal forwarding and release associated resources.
void stopSignalForwarding() override;
bool isSignalForwarding() const override;
Transport transport() const override;
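
The wrap-around reconstruction described in the comment above fits in a few lines. A sketch of the logic only, not the shipped recv-thread code:

// Rebuild the monotonically increasing 64-bit token from the 32-bit imm_data payload.
uint64_t reconstructToken(uint64_t lastToken, uint32_t lower32) {
  uint64_t upper = lastToken & 0xFFFFFFFF00000000ull;
  if (lower32 < static_cast<uint32_t>(lastToken)) upper += (1ull << 32);  // lower half wrapped
  return upper | lower32;
}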

@@ -42,8 +42,6 @@ struct Context::Impl {
std::shared_ptr<TokenPool> tokenPool_;
const size_t maxNumTokens_ = 1 << 15; // 32K tokens
Impl();
IbCtx* getIbContext(Transport ibTransport);
std::shared_ptr<uint64_t> getToken();
};

@@ -210,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input
sizeof(int4);
void* remoteMemory = static_cast<char*>(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]);
val = mscclpp::read<int4>(remoteMemory, srcOffset + idx);
tmp = cal_vector<T, OpType>(tmp, val);
tmp = calVector<T, OpType>(tmp, val);
}
output4[outputOffset4 + idx] = tmp;
if constexpr (SendToRemote) {
@@ -353,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in
for (uint32_t index = 0; index < nSrcs; ++index) {
PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]);
PacketPayload<PacketType> val = pkt[idx].read(flag_);
data = cal_vector<T, OpType>(data, val);
data = calVector<T, OpType>(data, val);
}
data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
data = calVector<T, OpType>(data, srcPacketPayload[idx]);
dstPacketPayload[idx] = data;
if constexpr (SendToRemote) {
@@ -394,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void
for (uint32_t index = 0; index < nSrcs; ++index) {
PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]);
PacketPayload<PacketType> val = pkt[idx].read(flag_);
data = cal_vector<T, OpType>(data, val);
data = calVector<T, OpType>(data, val);
}
data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
data = calVector<T, OpType>(data, srcPacketPayload[idx]);
dstPacketPayload[idx] = data;
PacketType* dst_val = &dstPkt[idx];
dst_val->write(data, flag_);
@@ -464,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo
size_t buffOffset =
(inputOffsets[index] + getOffset<ReuseScratch>(outputBufferRefs[index].type, offset)) / sizeof(int4);
int4 val = buff4[buffOffset + idx];
tmp = cal_vector<T, OpType>(tmp, val);
tmp = calVector<T, OpType>(tmp, val);
}
dst4[dstOffset4 + idx] = tmp;
if constexpr (SendToRemote) {
@@ -899,6 +899,17 @@ class ExecutionKernel {
#endif
break;
#endif // __FP8_TYPES_EXIST__
case DataType::FLOAT8_E4M3B15:
executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize,
plan, semaphores, localMemoryIdBegin, flag
#if defined(ENABLE_NPKIT)
,
NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
#else
);
#endif
break;
case DataType::UINT8:
executionKernel<uint8_t, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores,
@@ -910,6 +921,10 @@ class ExecutionKernel {
);
#endif
break;
case DataType::AUTO:
// AUTO is a sentinel that must be resolved before reaching this point.
assert(false && "DataType::AUTO must be resolved before kernel launch");
break;
}
}
#else // !defined(MSCCLPP_DEVICE_HIP)

src/core/include/gdr.hpp (new file, 62 lines)

@@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_GDR_HPP_
#define MSCCLPP_GDR_HPP_
#include <cstddef>
#include <cstdint>
#include <memory>
namespace mscclpp {
enum class GdrStatus {
Ok, // GDRCopy initialized successfully
NotBuilt, // Built without MSCCLPP_USE_GDRCOPY
Disabled, // Disabled via MSCCLPP_FORCE_DISABLE_GDR
DriverMissing, // /dev/gdrdrv not found
OpenFailed, // gdr_open() failed
};
/// Return the detailed status of the global GDRCopy context.
GdrStatus gdrStatus();
/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
bool gdrEnabled();
/// Return a human-readable error message for the current GDRCopy status.
const char* gdrStatusMessage();
/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
class GdrMap {
public:
/// Pin and map a GPU address for direct host-side access.
/// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared).
/// @param deviceId The CUDA device ID for setting context.
GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
~GdrMap();
GdrMap(const GdrMap&) = delete;
GdrMap& operator=(const GdrMap&) = delete;
/// Whether the mapping was established successfully.
bool valid() const;
/// Return the BAR1-mapped host pointer to the GPU location.
uint64_t* hostPtr() const;
/// Copy data from host memory to the mapped GPU location.
void copyTo(const void* src, size_t size);
/// Copy data from the mapped GPU location to host memory.
void copyFrom(void* dst, size_t size) const;
private:
struct Impl;
std::unique_ptr<Impl> pimpl_;
};
} // namespace mscclpp
#endif // MSCCLPP_GDR_HPP_

@@ -44,6 +44,7 @@ struct GpuIpcMemHandle {
struct {
char handle[64];
CUmemGenericAllocationHandle allocHandle;
} fabric;
static void deleter(GpuIpcMemHandle* handle);

@@ -36,7 +36,7 @@ class IbMr {
uint32_t getLkey() const;
private:
IbMr(ibv_pd* pd, void* buff, std::size_t size);
IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);
ibv_mr* mr_;
void* buff_;
@@ -101,7 +101,7 @@ class IbQp {
};
IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend);
int maxRecvWr, int maxWrPerSend, bool noAtomic);
SendWrInfo getNewSendWrInfo();
RecvWrInfo getNewRecvWrInfo();
@@ -128,6 +128,7 @@ class IbQp {
const int maxSendWr_;
const int maxWrPerSend_;
const int maxRecvWr_;
const bool noAtomic_;
friend class IbCtx;
};
@@ -139,18 +140,24 @@ class IbCtx {
~IbCtx();
std::shared_ptr<IbQp> createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend);
int maxRecvWr, int maxWrPerSend, bool noAtomic);
std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
bool supportsRdmaAtomics() const;
bool isMlx5() const;
bool isDataDirect() const;
bool isVirtualFunction() const;
#else
IbCtx([[maybe_unused]] const std::string& devName) {}
~IbCtx() {}
std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int) { return nullptr; }
std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int, bool) { return nullptr; }
std::unique_ptr<const IbMr> registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) {
return nullptr;
}
bool supportsRdmaAtomics() const { return false; }
bool isMlx5() const { return false; }
bool isDataDirect() const { return false; }
bool isVirtualFunction() const { return false; }
#endif
const std::string& getDevName() const { return devName_; };
@@ -163,6 +170,9 @@ class IbCtx {
ibv_context* ctx_;
ibv_pd* pd_;
bool supportsRdmaAtomics_;
bool isMlx5_;
bool isDataDirect_;
bool isVF_;
};
} // namespace mscclpp

@@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_
#define MSCCLPP_MLX5DV_WRAPPER_HPP_
#if defined(MSCCLPP_USE_MLX5DV)
#include <infiniband/verbs.h>
#include <string>
namespace mscclpp {
struct MLX5DV {
/// Whether libmlx5.so was successfully loaded at runtime.
static bool isAvailable();
/// Check if the given IB device supports mlx5 Direct Verbs.
static bool mlx5dv_is_supported(struct ibv_device* device);
/// Register a DMABUF memory region using mlx5dv extensions.
/// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access);
/// Query the Data Direct sysfs path for the given IB context.
/// Returns 0 on success (device supports Data Direct), non-zero otherwise.
static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);
private:
static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
};
} // namespace mscclpp
#endif // defined(MSCCLPP_USE_MLX5DV)
#endif // MSCCLPP_MLX5DV_WRAPPER_HPP_

@@ -14,7 +14,7 @@ namespace mscclpp {
// Generic element-wise calculation helper
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) {
if constexpr (OpType == SUM) {
return a + b;
} else if constexpr (OpType == MIN) {
@@ -24,56 +24,168 @@ MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
}
// Generic vector reduction helpers
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) {
int4 ret;
ret.w = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
ret.x = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
ret.y = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
ret.z = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
return ret;
}
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) {
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) {
uint2 ret;
ret.x = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
ret.y = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
ret.x = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
ret.y = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
return ret;
}
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) {
return bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available).
template <>
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, SUM>(const uint2& a, const uint2& b) {
f32x2 fa = bit_cast<f32x2, uint2>(a);
f32x2 fb = bit_cast<f32x2, uint2>(b);
f32x2 fr = fa + fb;
return bit_cast<uint2, f32x2>(fr);
}
template <>
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, MIN>(const uint2& a, const uint2& b) {
f32x2 fa = bit_cast<f32x2, uint2>(a);
f32x2 fb = bit_cast<f32x2, uint2>(b);
f32x2 fr = mscclpp::min(fa, fb);
return bit_cast<uint2, f32x2>(fr);
}
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) {
return bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) {
int4 ret;
ret.w = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
ret.x = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
ret.y = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
ret.z = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
return ret;
}
// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper
/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic.
template <>
MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, SUM>(const int4& a, const int4& b) {
uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
uint2 lo_r = calVectorHelper<f32x2, SUM>(lo_a, lo_b);
uint2 hi_r = calVectorHelper<f32x2, SUM>(hi_a, hi_b);
return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
}
template <>
MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, MIN>(const int4& a, const int4& b) {
uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
uint2 lo_r = calVectorHelper<f32x2, MIN>(lo_a, lo_b);
uint2 hi_r = calVectorHelper<f32x2, MIN>(hi_a, hi_b);
return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
}
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) {
return bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
}
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) {
return bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
}
/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback).
template <>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, SUM>(const uint32_t& a, const uint32_t& b) {
float fa = bit_cast<float, uint32_t>(a);
float fb = bit_cast<float, uint32_t>(b);
return bit_cast<uint32_t, float>(fa + fb);
}
template <>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, MIN>(const uint32_t& a, const uint32_t& b) {
float fa = bit_cast<float, uint32_t>(a);
float fb = bit_cast<float, uint32_t>(b);
return bit_cast<uint32_t, float>(fminf(fa, fb));
}
// calVector wrapper converts scalar types to vector types and calls calVectorHelper
template <typename T, ReduceOp OpType, typename DataType>
MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) {
MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) {
// Define the vectorized computation type based on the element type
static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size");
static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes");
using CompType = typename std::conditional_t<
std::is_same_v<T, __half>, f16x2,
std::is_same_v<T, float>, f32x2,
std::conditional_t<
std::is_same_v<T, __bfloat16>, bf16x2,
std::conditional_t<std::is_same_v<T, uint8_t>, u8x4,
std::is_same_v<T, __half>, f16x2,
std::conditional_t<
std::is_same_v<T, __bfloat16>, bf16x2,
std::conditional_t<
std::is_same_v<T, uint8_t>, u8x4,
std::conditional_t<std::is_same_v<T, __fp8_e4m3b15>, f8_e4m3b15x4,
#if defined(__FP8_TYPES_EXIST__)
std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4,
#endif
T
#if defined(__FP8_TYPES_EXIST__)
>>>>>;
std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4, T>>
#else
>>>;
T
#endif
return cal_vector_helper<CompType, OpType>(a, b);
>>>>>;
return calVectorHelper<CompType, OpType>(a, b);
}
/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements).
/// Uses the optimized to<>() specializations when available (e.g. FP8 -> float hardware intrinsics).
/// When AccumT == T, this is a no-op identity.
template <typename T, typename AccumT, typename AccDataType, typename DataType>
MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return val;
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;
ToVec result = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
return reinterpret_cast<const AccDataType&>(result);
}
}
/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements).
/// Uses the optimized to<>() specializations when available.
/// When AccumT == T, this is a no-op identity.
template <typename T, typename AccumT, typename DataType, typename AccDataType>
MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return val;
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;
FromVec result = mscclpp::to<FromVec>(reinterpret_cast<const ToVec&>(val));
return reinterpret_cast<const DataType&>(result);
}
}
/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType).
/// When AccumT == T, falls back to the standard calVector.
/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator.
template <typename T, typename AccumT, ReduceOp OpType, typename AccDataType, typename DataType>
MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return calVector<T, OpType>(acc, val);
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;
ToVec fv = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
const ToVec& fa = reinterpret_cast<const ToVec&>(acc);
ToVec fr;
#pragma unroll
for (int i = 0; i < nElems; ++i) {
fr.data[i] = calElements<AccumT, OpType>(fa.data[i], fv.data[i]);
}
return reinterpret_cast<const AccDataType&>(fr);
}
}
#endif // defined(MSCCLPP_DEVICE_COMPILE)
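
As a device-side illustration of the new accumulate helpers, the sketch below reduces four __half lanes packed in a uint2 while accumulating in float. It assumes VectorType<float, 4> is the accumulator layout the helpers expect and that the ReduceOp enumerators are visible, as they are inside this header.

MSCCLPP_DEVICE_INLINE uint2 reduceHalfWithFloatAccum(const uint2& a, const uint2& b) {
  using AccVec = mscclpp::VectorType<float, 4>;  // 4 half lanes widen to 4 float lanes
  AccVec acc = mscclpp::upcastVector<__half, float, AccVec>(a);       // widen once
  acc = mscclpp::calVectorAccum<__half, float, SUM, AccVec>(acc, b);  // reduce in float
  return mscclpp::downcastVector<__half, float, uint2>(acc);          // narrow at the end
}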

src/core/mlx5dv_wrapper.cc (new file, 126 lines)

@@ -0,0 +1,126 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#if defined(MSCCLPP_USE_MLX5DV)
// _GNU_SOURCE is required for dlvsym()
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "mlx5dv_wrapper.hpp"
#include <dlfcn.h>
#include <infiniband/mlx5dv.h>
#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
#endif
#include <memory>
#include "logger.hpp"
namespace mscclpp {
static std::unique_ptr<void, int (*)(void*)> globalMLX5Handle(nullptr, &::dlclose);
void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) {
if (!globalMLX5Handle) {
const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
for (int i = 0; possibleLibNames[i] != nullptr; i++) {
void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
if (handle) {
globalMLX5Handle.reset(handle);
break;
}
}
if (!globalMLX5Handle) {
if (allowReturnNull) return nullptr;
THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror()));
}
}
void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str());
if (!ptr && !allowReturnNull) {
THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol);
}
return ptr;
}
bool MLX5DV::isAvailable() {
static int available = -1;
if (available == -1) {
// Try to load the library; if it fails, mlx5dv is not available
const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
for (int i = 0; possibleLibNames[i] != nullptr; i++) {
void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
if (handle) {
if (!globalMLX5Handle) {
globalMLX5Handle.reset(handle);
} else {
::dlclose(handle);
}
available = 1;
INFO(NET, "libmlx5 loaded successfully");
return true;
}
}
available = 0;
DEBUG(NET, "libmlx5 not available");
}
return available == 1;
}
bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
using FuncType = bool (*)(struct ibv_device*);
static FuncType impl = nullptr;
if (!impl) {
void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true);
if (!ptr) return false;
impl = reinterpret_cast<FuncType>(ptr);
}
return impl(device);
}
struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access) {
// mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
// Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol.
using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int);
static FuncType impl = nullptr;
static bool resolved = false;
if (!resolved) {
if (globalMLX5Handle) {
void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25");
if (!ptr) {
ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
}
impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
}
resolved = true;
}
if (!impl) return nullptr;
return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
}
int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
using FuncType = int (*)(struct ibv_context*, char*, size_t);
static FuncType impl = nullptr;
static bool resolved = false;
if (!resolved) {
if (globalMLX5Handle) {
void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
if (!ptr) {
ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
}
impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
}
resolved = true;
}
if (!impl) return -1;
return impl(context, buf, buf_len);
}
} // namespace mscclpp
#endif // defined(MSCCLPP_USE_MLX5DV)
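
A note on the dlvsym calls above: rdma-core versions its mlx5dv symbols, and an unversioned dlsym lookup may bind to an older variant of mlx5dv_reg_dmabuf_mr that lacks the trailing mlx5-specific access argument. The versioned-lookup pattern, reduced to a hedged minimal sketch:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // dlvsym() is a GNU extension
#endif
#include <dlfcn.h>

static void* resolveDataDirectSymbol() {
  void* handle = dlopen("libmlx5.so.1", RTLD_NOW);
  if (handle == nullptr) return nullptr;
  // Ask for the MLX5_1.25 version explicitly; fall back to the default binding.
  void* fn = dlvsym(handle, "mlx5dv_reg_dmabuf_mr", "MLX5_1.25");
  return fn != nullptr ? fn : dlsym(handle, "mlx5dv_reg_dmabuf_mr");
}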

@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
else
return 25000;
#else
cudaDeviceProp dev_prop;
int clockRate;
MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
return dev_prop.clockRate;
MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
return clockRate;
#endif
}
#endif

@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
}
}
} else if (transports.has(Transport::CudaIpc)) {
// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
// try CudaIpc first and fall back to IB on failure.
auto entry = getTransportInfo(Transport::CudaIpc);
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
bool hasIB = (transports & AllIBTransports).any();
try {
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
} catch (const BaseError& e) {
if (!hasIB) {
throw;
}
bool isSameHost = (getHostHash() == this->hostHash);
if (isSameHost) {
WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
} else {
INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
}
}
}
if (this->data != nullptr) {
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);

@@ -8,6 +8,7 @@
#include "atomic.hpp"
#include "connection.hpp"
#include "context.hpp"
#include "logger.hpp"
#include "registered_memory.hpp"
#include "serialization.hpp"
@@ -48,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
token_ = std::make_shared<uint64_t>(0);
} else if (localDevice.type == DeviceType::GPU) {
if (localDevice.id < 0) {
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided");
}
CudaDeviceGuard deviceGuard(localDevice.id);
token_ = gpuCallocToken(connection_.context());
} else {
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type");
}
idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport()));
}
@@ -78,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector<char>
RegisteredMemory idMemory(std::make_shared<RegisteredMemory::Impl>(data.begin(), memEnd));
auto it = detail::deserialize(memEnd, device);
if (it != data.end()) {
throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed");
}
return SemaphoreStub(std::make_shared<Impl>(std::move(idMemory), device));
}
@@ -119,15 +120,35 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
outboundToken_(std::make_unique<uint64_t>()) {
if (connection().localDevice().type != DeviceType::GPU) {
throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
}
BaseConnection::getImpl(connection())
->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->isSignalForwarding()) {
// Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
// WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
CudaDeviceGuard deviceGuard(connection().localDevice().id);
#if defined(MSCCLPP_USE_ROCM)
inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
#else
inboundToken_ = detail::gpuCallocShared<uint64_t>();
#endif
connImpl->startSignalForwarding(inboundToken_);
}
// When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
// and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
}
MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
: Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}
MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
if (inboundToken_) {
// Clear the connection's signal forwarding destination (and GdrMap)
// before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
BaseConnection::getImpl(connection())->stopSignalForwarding();
}
}
MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }
MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -136,7 +157,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
Host2DeviceSemaphore::DeviceHandle device;
device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
// If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
// Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
// which is the same address targeted by the NIC's atomic operation.
device.inboundToken =
inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
device.expectedInboundToken = expectedInboundToken_.get();
return device;
}
@@ -146,13 +171,19 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
expectedInboundToken_(std::make_unique<uint64_t>()),
outboundToken_(std::make_unique<uint64_t>()) {
if (connection().transport() == Transport::CudaIpc) {
throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
}
if (connection().localDevice().type != DeviceType::CPU) {
throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
}
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->isSignalForwarding()) {
// Signal forwarding mode: tell the recv thread where to write the incoming token.
// Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
auto token =
std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
connImpl->startSignalForwarding(std::move(token));
}
BaseConnection::getImpl(connection())
->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
}
MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection)
@@ -177,17 +208,16 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
while (atomicLoad(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), memoryOrderAcquire) <
(*expectedInboundToken_)) {
if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out");
}
}
}
MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore)
: semaphore_(semaphore),
expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
outboundToken_(detail::gpuCallocUnique<uint64_t>()) {
: semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()) {
if (connection().localDevice().type != DeviceType::GPU) {
throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage,
"Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU");
}
}
@@ -202,7 +232,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe
device.remoteInboundToken = reinterpret_cast<uint64_t*>(semaphore_.remoteMemory().data());
device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
device.expectedInboundToken = expectedInboundToken_.get();
device.outboundToken = outboundToken_.get();
return device;
};
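
For intuition, the GPU side of a Host2DeviceSemaphore wait reduces to polling whichever token address deviceHandle() selected (the forwarded inboundToken_ or the NIC's atomic target). A hedged device-code sketch, not the shipped handle implementation:

// Spin until the inbound token catches up with the locally tracked expectation.
MSCCLPP_DEVICE_INLINE void semaphoreWait(uint64_t* inboundToken, uint64_t* expectedInboundToken) {
  uint64_t expected = *expectedInboundToken + 1;
  while (mscclpp::atomicLoad(inboundToken, mscclpp::memoryOrderAcquire) < expected) {
  }
  *expectedInboundToken = expected;
}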

@@ -183,7 +183,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh::build() {
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] DataType accumDtype) -> CommResult {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

@@ -212,7 +212,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh2::build() {
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) -> mscclpp::CommResult {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

@@ -2,6 +2,7 @@
// Licensed under the MIT license.
#include <collective_utils.hpp>
#include <type_traits>
#include "allreduce/allreduce_allpair_packet.hpp"
#include "allreduce/common.hpp"
@@ -11,7 +12,7 @@
namespace mscclpp {
namespace collective {
template <ReduceOp OpType, typename T>
template <ReduceOp OpType, typename T, typename AccumT = T>
__global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
@@ -43,13 +44,16 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
// step 2: Reduce Data
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nelems; idx += blockDim.x * gridDim.x) {
uint32_t data = src[idx];
using AccRaw = std::conditional_t<std::is_same_v<T, AccumT>, uint32_t,
mscclpp::VectorType<AccumT, sizeof(uint32_t) / sizeof(T)>>;
AccRaw acc = mscclpp::upcastVector<T, AccumT, AccRaw>(data);
for (int index = 0; index < nPeers; index++) {
const int remoteRank = index < rank ? index : index + 1;
LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems;
uint32_t val = dstPkt[idx].read(flag, -1);
data = cal_vector<T, OpType>(val, data);
acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(acc, val);
}
dst[idx] = data;
dst[idx] = mscclpp::downcastVector<T, AccumT, uint32_t>(acc);
}
__syncthreads();
if (threadIdx.x == 0) {
@@ -67,7 +71,7 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
return {(worldSize - 1) * 4, 512};
}
template <ReduceOp OpType, typename T>
template <ReduceOp OpType, typename T, typename AccumT = T>
struct AllpairAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
@@ -76,7 +80,12 @@ struct AllpairAdapter {
int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
allreduceAllPairs<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
// Round nBlocks down to a multiple of nPeers so every block maps to a valid peer.
const int nPeers = worldSize - 1;
if (nPeers > 0) {
nBlocks = (nBlocks / nPeers) * nPeers;
}
allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
return cudaGetLastError();
@@ -94,18 +103,24 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr<Communicator> comm) {
CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>&) {
const std::unordered_map<std::string, uintptr_t>&,
DataType accumDtype) {
auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
std::pair<int, int> blockAndThreadNum{nBlocks, nThreadsPerBlock};
if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
}
// nBlocks must be at least nPeers for allpair — each block maps to one peer.
const int nPeers = algoCtx->nRanksPerNode - 1;
if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
return CommResult::CommInvalidArgument;
}
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
size_t channelInOffset = (char*)input - (char*)sendBasePtr;
AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype);
AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype, accumDtype);
if (!allreduce) {
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
@@ -161,9 +176,9 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
[self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
extras);
extras, accumDtype);
},
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
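
Tying the new accumDtype parameter together: a caller requesting mixed-precision accumulation would pass a wider accumulator type through dispatch, roughly as below. This is a sketch; the FLOAT16/FLOAT32 enumerators are assumed from the broader codebase and do not appear in this diff.

// Half-precision buffers, float accumulation inside the kernel.
AllreduceFunc allreduce = dispatch<AllpairAdapter>(ReduceOp::SUM, DataType::FLOAT16, DataType::FLOAT32);
if (!allreduce) {
  // Combination unsupported: fall back to same-type accumulation.
  allreduce = dispatch<AllpairAdapter>(ReduceOp::SUM, DataType::FLOAT16, DataType::FLOAT16);
}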

Some files were not shown because too many files have changed in this diff.