Mirror of https://github.com/microsoft/mscclpp.git (synced 2026-05-11 08:50:21 +00:00)
Merge branch 'main' into binyli/unique-qp-and-gid-index
Resolve conflicts in env.hpp, env_py.cpp, and env.cpp by combining both branches' additions: keep main's MSCCLPP_FORCE_DISABLE_GDR field and this branch's -1 sentinel default for MSCCLPP_IB_GID_INDEX.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
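As context for the resolution described above, a minimal illustrative sketch of an environment object that carries both additions: main's GDR-disable flag alongside this branch's -1 "unset" sentinel for the IB GID index. The environment-variable names come from the commit message; the struct name, member names, types, and parsing shown here are assumptions for illustration, not the repository's actual env.hpp.

    // Hypothetical sketch only -- the real env.hpp layout may differ.
    #include <cstdlib>
    #include <string>

    struct EnvSketch {
      bool forceDisableGdr = false;  // from main: MSCCLPP_FORCE_DISABLE_GDR
      int ibGidIndex = -1;           // from this branch: MSCCLPP_IB_GID_INDEX, -1 means "not set"

      // Read both settings from the process environment, keeping the defaults otherwise.
      static EnvSketch fromEnvironment() {
        EnvSketch e;
        if (const char* v = std::getenv("MSCCLPP_FORCE_DISABLE_GDR")) e.forceDisableGdr = (std::string(v) == "1");
        if (const char* v = std::getenv("MSCCLPP_IB_GID_INDEX")) e.ibGidIndex = std::atoi(v);
        return e;
      }
    };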
93
.azure-pipelines/codecov.yml
Normal file
@@ -0,0 +1,93 @@
trigger:
  branches:
    include:
      - main
      - release/*
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

pr:
  branches:
    include:
      - main
      - release/*
  drafts: false
  paths:
    exclude:
      - .devcontainer/**
      - .github/**
      - apps/**
      - docker/**
      - docs/**
      - '**/*.md'

jobs:
- job: CodeCoverageA100
  timeoutInMinutes: 40
  pool:
    name: msccl-ci
  variables:
    - group: mscclpp
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  container:
    image: $(containerImage)

  steps:
  - template: templates/codecov.yml
    parameters:
      subscription: mscclpp-ci
      vmssName: mscclpp-ci
      gpuArch: '80'

- job: CodeCoverageH100
  timeoutInMinutes: 40
  pool:
    name: msccl-ci-h100
  variables:
    - group: mscclpp
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  container:
    image: $(containerImage)

  steps:
  - template: templates/codecov.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: mscclpp-h100-ci
      gpuArch: '90'

- job: CodeCoverageMI300X
  timeoutInMinutes: 40
  pool:
    name: msccl-ci-mi300x
  variables:
    - group: mscclpp
  strategy:
    matrix:
      rocm6_2:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2

  container:
    image: $(containerImage)

  steps:
  - template: templates/codecov.yml
    parameters:
      subscription: mscclpp-ci-mi300x
      vmssName: mscclpp-mi300x-ci
      platform: rocm
      gpuArch: gfx942
@@ -41,11 +41,10 @@ jobs:
      image: $(containerImage)

  steps:
  - template: templates/integration-test.yaml
  - template: templates/integration-test.yml
    parameters:
      subscription: mscclpp-ci
      vmssName: mscclpp-ci
      sshKeySecureFile: mscclpp.pem
      gpuArch: '80'

- job: IntegrationTestH100
@@ -61,10 +60,9 @@ jobs:
      image: $(containerImage)

  steps:
  - template: templates/integration-test.yaml
  - template: templates/integration-test.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: mscclpp-h100-ci
      sshKeySecureFile: mscclpp.pem
      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
      gpuArch: '90'
@@ -16,168 +16,109 @@ pr: none

parameters:
  - name: vmssName
    type: string
    default: mscclpp-h100-multinode-ci
  - name: hostEntries
    type: string
    default: |
      10.0.0.10 mscclit-000000
      10.0.0.11 mscclit-000001
      10.0.0.5 mscclpp-h100-multinode-ci000000
      10.0.0.4 mscclpp-h100-multinode-ci000001

jobs:
- job: MultiNodesTest
  displayName: Multi nodes test
  strategy:
    matrix:
      cuda11:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
  pool:
    name: mscclpp-it
    name: mscclpp-multi-node
  container:
    image: $[ variables['containerImage'] ]

  steps:
  - task: Bash@3
    name: Build
    displayName: Build
    inputs:
      targetType: 'inline'
      script: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
        make -j
      workingDirectory: '$(System.DefaultWorkingDirectory)'

  - task: DownloadSecureFile@1
    name: SshKeyFile
    displayName: Download key file
    inputs:
      secureFile: mscclpp-ssh.key

  - task: Bash@3
    name: InstallPackages
    displayName: Install Packages
    inputs:
      targetType: 'inline'
      script: |
        sudo apt-get update -y
        sudo apt-get install pssh -y
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

  - task: Bash@3
    displayName: Add HostEntry
    inputs:
      targetType: 'inline'
      script: |
        ENTRY="${{ parameters.hostEntries }}"
        if ! grep -qxF "$ENTRY" /etc/hosts; then
          echo "Adding to /etc/hosts"
          echo "$ENTRY" | sudo tee -a /etc/hosts
        else
          echo "Entry already exists, nothing to do."
        fi

  - task: AzureCLI@2
    name: StartVMSS
    displayName: Start VMSS
    inputs:
      azureSubscription: msccl-it
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss start --name mscclit-vmss --resource-group msccl-IT
        while IFS= read -r line; do
          [ -z "$line" ] && continue
          if ! grep -qxF "$line" /etc/hosts; then
            echo "Adding to /etc/hosts: $line"
            echo "$line" | sudo tee -a /etc/hosts
          else
            echo "Entry already exists: $line"
          fi
        done <<< "${{ parameters.hostEntries }}"

  - task: Bash@3
    name: DeployTestEnv
    displayName: Deploy Test Env
    inputs:
      targetType: filePath
      filePath: test/deploy/deploy.sh
      workingDirectory: '$(System.DefaultWorkingDirectory)'

  - task: Bash@3
    name: RunMscclppTest
    displayName: Run multi-nodes mscclpp-test
    displayName: Generate deploy files
    inputs:
      targetType: 'inline'
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        rm -rf output/*
        mkdir -p output
        touch output/mscclit-000000
        tail -f output/mscclit-000000 &
        CHILD_PID=$!
        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
        kill $CHILD_PID
        VMSS="${{ parameters.vmssName }}"
        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
        NODE0="${VMSS}000000"
        NODE1="${VMSS}000001"

  - task: Bash@3
    name: RunMultiNodeUnitTest
    displayName: Run multi-nodes unit tests
    inputs:
      targetType: 'inline'
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        rm -rf output/*
        mkdir -p output
        touch output/mscclit-000000
        tail -f output/mscclit-000000 &
        CHILD_PID=$!
        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
        kill $CHILD_PID
        echo "Host ${NODE0}
          Port 22345
          IdentityFile /root/mscclpp/sshkey
          StrictHostKeyChecking no
        Host ${NODE1}
          Port 22345
          IdentityFile /root/mscclpp/sshkey
          StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"

  - task: Bash@3
    name: RunMultiNodePythonTests
    displayName: Run multi-nodes python tests
    inputs:
      targetType: 'inline'
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        rm -rf output/*
        mkdir -p output
        touch output/mscclit-000000
        tail -f output/mscclit-000000 &
        CHILD_PID=$!
        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
        kill $CHILD_PID
        printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"

  - task: Bash@3
    name: RunMultiNodePythonBenchmark
    displayName: Run multi-nodes python benchmark
    inputs:
      targetType: 'inline'
      script: |
        set -e
        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
        SSH_OPTION="StrictHostKeyChecking=no"
        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
        rm -rf output/*
        mkdir -p output
        touch output/mscclit-000000
        tail -f output/mscclit-000000 &
        CHILD_PID=$!
        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
          -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
        kill $CHILD_PID
        printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"

  - task: AzureCLI@2
    name: StopVMSS
    displayName: Deallocate VMSS
    condition: always()
    inputs:
      azureSubscription: msccl-it
      scriptType: bash
      scriptLocation: inlineScript
      inlineScript: |
        az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
  - template: templates/deploy.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}
      resourceGroup: mscclpp
      gpuArch: '90'

  - template: templates/run-remote-task.yml
    parameters:
      name: RunMscclppTest
      displayName: Run multi-nodes mscclpp-test
      continueOnError: true
      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      remoteScript: |
        bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test

  - template: templates/run-remote-task.yml
    parameters:
      name: RunMultiNodeUnitTest
      displayName: Run multi-nodes unit tests
      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      remoteScript: |
        bash /root/mscclpp/test/deploy/run_tests.sh mp-ut

  - template: templates/run-remote-task.yml
    parameters:
      name: RunMultiNodePythonTests
      displayName: Run multi-nodes python tests
      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      remoteScript: |
        bash /root/mscclpp/test/deploy/run_tests.sh pytests

  - template: templates/run-remote-task.yml
    parameters:
      name: RunMultiNodePythonBenchmark
      displayName: Run multi-nodes python benchmark
      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
      remoteScript: |
        bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark

  - template: templates/stop.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: ${{ parameters.vmssName }}
      resourceGroup: mscclpp
@@ -40,11 +40,11 @@ jobs:
      image: $(containerImage)

  steps:
  - template: templates/nccl-test.yaml
  - template: templates/nccl-test.yml
    parameters:
      subscription: mscclpp-ci
      vmssName: mscclpp-ci
      sshKeySecureFile: mscclpp.pem
      gpuArch: '80'
      nvccGencode: "-gencode=arch=compute_80,code=sm_80"

- job: NcclTestH100
@@ -61,9 +61,9 @@ jobs:
      image: $(containerImage)

  steps:
  - template: templates/nccl-test.yaml
  - template: templates/nccl-test.yml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: mscclpp-h100-ci
      sshKeySecureFile: mscclpp.pem
      gpuArch: '90'
      nvccGencode: "-gencode=arch=compute_90,code=sm_90"
@@ -40,9 +40,8 @@ jobs:
      image: $(containerImage)

  steps:
  - template: templates/rccl-test.yaml
  - template: templates/rccl-test.yml
    parameters:
      subscription: mscclpp-ci-mi300x
      vmssName: mscclpp-mi300x-ci
      sshKeySecureFile: mscclpp.pem
      gpuArch: gfx942
110
.azure-pipelines/templates/codecov.yml
Normal file
@@ -0,0 +1,110 @@
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: platform
    type: string
    default: 'cuda'
  - name: gpuArch
    type: string

steps:
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    platform: ${{ parameters.platform }}
    gpuArch: ${{ parameters.gpuArch }}
    buildType: Debug
    cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON'
    buildDisplayName: 'Build with coverage'
    buildName: BuildCoverage
    deployArgs: 'single-node-test true ${{ parameters.platform }}'

- template: run-remote-task.yml
  parameters:
    name: TestsCoverageNonPerf
    displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
    remoteScript: |
      BUILD_PREFIX=$(cat build/BUILD_PREFIX)
      STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c)
      export GCOV_PREFIX=/root/mscclpp
      export GCOV_PREFIX_STRIP=$STRIP_COUNT

      echo "Running unit_tests..."
      ./build/bin/unit_tests
      echo "unit_tests: PASSED"

      echo "Running mp_unit_tests -np 2..."
      mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests
      echo "mp_unit_tests -np 2: PASSED"

      echo "Running mp_unit_tests -np 4..."
      mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
      echo "mp_unit_tests -np 4: PASSED"

- template: run-remote-task.yml
  parameters:
    name: CaptureCoverage
    displayName: Capture coverage data with lcov
    remoteScript: |
      BUILD_PREFIX=$(cat build/BUILD_PREFIX)

      GCOV_TOOL_ARG=""
      if [ "${{ parameters.platform }}" = "rocm" ]; then
        apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1
        GCOV_WRAPPER=$(mktemp)
        printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER"
        chmod +x "$GCOV_WRAPPER"
        GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}"
      fi

      lcov --version
      LCOV_CAPTURE_ARGS=""
      if lcov --help 2>&1 | grep -q "inconsistent"; then
        LCOV_CAPTURE_ARGS="--ignore-errors inconsistent"
      fi

      lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS}
      if [ ! -s coverage.info ]; then
        echo "ERROR: coverage.info was not generated."
        exit 1
      fi

      lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info
      lcov --list coverage.info
      ls -la coverage.info

- task: Bash@3
  name: FetchCoverage
  displayName: Fetch coverage data from remote VM
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      HOST=$(head -1 ${HOSTFILE})
      ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \
        'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info'
      scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: UploadCodecov
  displayName: Upload coverage to Codecov
  inputs:
    targetType: 'inline'
    script: |
      set -e
      curl -Os https://cli.codecov.io/latest/linux/codecov
      chmod +x codecov
      ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
151
.azure-pipelines/templates/deploy.yml
Normal file
@@ -0,0 +1,151 @@
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: resourceGroup
    type: string
    default: mscclpp
  # Build parameters
  - name: platform
    type: string
    default: 'cuda'
  - name: gpuArch
    type: string
    default: ''
  - name: buildType
    type: string
    default: 'Release'
  - name: buildTests
    type: string
    default: 'true'
  - name: cmakeArgs
    type: string
    default: ''
  - name: buildName
    type: string
    default: 'Build'
  - name: buildDisplayName
    type: string
    default: 'Build'
  # Deploy parameters
  - name: deployArgs
    type: string
    default: ''

steps:
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
- task: Bash@3
  name: EnsureAzureCLI
  displayName: Ensure Azure CLI Installed
  inputs:
    targetType: inline
    script: |
      set -e
      if command -v az >/dev/null 2>&1; then
        az version >/dev/null
        exit 0
      fi
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# 1. Build
- task: Bash@3
  name: ${{ parameters.buildName }}
  displayName: ${{ parameters.buildDisplayName }}
  inputs:
    targetType: 'inline'
    script: |
      set -e
      rm -rf build
      mkdir -p build && cd build
      BUILD_TESTS_ARG=""
      if [ "${{ parameters.buildTests }}" = "true" ]; then
        BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON"
      fi

      GPU_ARCH_ARG=""
      if [ -n "${{ parameters.gpuArch }}" ]; then
        GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
      fi

      CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
      if [ "${{ parameters.platform }}" = "rocm" ]; then
        eval CXX=/opt/rocm/bin/hipcc cmake \
          -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
          -DMSCCLPP_USE_ROCM=ON \
          ${BUILD_TESTS_ARG} \
          ${GPU_ARCH_ARG} \
          ${CMAKE_EXTRA_ARGS} ..
      else
        eval cmake \
          -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \
          -DMSCCLPP_BYPASS_GPU_CHECK=ON \
          -DMSCCLPP_USE_CUDA=ON \
          ${BUILD_TESTS_ARG} \
          ${GPU_ARCH_ARG} \
          ${CMAKE_EXTRA_ARGS} ..
      fi
      make -j
      cd ..
      pwd > build/BUILD_PREFIX
      echo "=== Build artifacts ==="
      ls -la build/bin/ || echo "ERROR: build/bin/ missing after build"
      du -sh build/bin/* 2>/dev/null || true
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# 2. Write CMake args for pip install on remote VMs
- task: Bash@3
  name: WritePipCmakeArgs
  displayName: Write pip CMake args
  inputs:
    targetType: 'inline'
    script: |
      set -e
      PIP_CMAKE_ARGS=""
      if [ -n "${{ parameters.gpuArch }}" ]; then
        PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}"
      fi
      CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}'
      if [ -n "${CMAKE_EXTRA_ARGS}" ]; then
        PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}"
      fi
      echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt
      echo "pip CMake args: $(cat pip_cmake_args.txt)"
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# 3. Download SSH key + install packages + start VMSS
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: mscclpp.pem

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y

- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}

# 4. Deploy test environment
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: ${{ parameters.deployArgs }}
    workingDirectory: '$(System.DefaultWorkingDirectory)'
@@ -1,242 +0,0 @@
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: sshKeySecureFile
  type: string
- name: perfBaselineFile
  type: string
  default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
  type: string

steps:
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: inline
    script: |
      mkdir build && cd build
      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: inline
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}

- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp

- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test"
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: AllGatherTest
  displayName: Run mscclpp AllGather test
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        set -e; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: SendRecvTest
  displayName: Run mscclpp SendRecv test
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        set -e; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: AllReduceTest
  displayName: Run mscclpp AllReduce test
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        set -e; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: AllToAll
  displayName: Run mscclpp AllToAll test
  inputs:
    targetType: 'inline'
    script: |
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        set -e; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: CheckPerfNumber
  displayName: Check collective primitives performance
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        set -e; \
        cd /root/mscclpp; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: PythonAllReduceBenchmark
  displayName: Python Allreduce Benchmark
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        set -e; \
        cd /root/mscclpp; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        python3 -m pip install .; \
        mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: FifoPerfBenchmark
  displayName: FIFO Performance Benchmark
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        set -e; \
        export PATH=/usr/local/mpi/bin:\$PATH; \
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        ./build/bin/perf/fifo_test"'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'


- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
76
.azure-pipelines/templates/integration-test.yml
Normal file
@@ -0,0 +1,76 @@
parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: perfBaselineFile
  type: string
  default: 'test/deploy/perf_ndmv4.jsonl'
- name: gpuArch
  type: string

steps:
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    deployArgs: 'single-node-test'

- template: run-remote-task.yml
  parameters:
    name: AllGatherTest
    displayName: Run mscclpp AllGather test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl

- template: run-remote-task.yml
  parameters:
    name: SendRecvTest
    displayName: Run mscclpp SendRecv test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl

- template: run-remote-task.yml
  parameters:
    name: AllReduceTest
    displayName: Run mscclpp AllReduce test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl

- template: run-remote-task.yml
  parameters:
    name: AllToAll
    displayName: Run mscclpp AllToAll test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl

- template: run-remote-task.yml
  parameters:
    name: CheckPerfNumber
    displayName: Check collective primitives performance
    remoteScript: |
      python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}

- template: run-remote-task.yml
  parameters:
    name: PythonAllReduceBenchmark
    displayName: Python Allreduce Benchmark
    remoteScript: |
      python3 -m pip install .
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py

- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
@@ -1,282 +0,0 @@
# .azure-pipelines/templates/nccl-test.yaml
# ----------------------------------------
# A step-template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
#
# Parameters:
#   subscription – Azure subscription to use for VMSS start/stop
#   sshKeySecureFile – the secureFile name for your SSH key

parameters:
- name: subscription
  type: string
- name: vmssName
  type: string
- name: sshKeySecureFile
  type: string
- name: nvccGencode
  type: string
  default: "-gencode=arch=compute_80,code=sm_80"

steps:
- checkout: self
- checkout: git://One/msccl-users
- task: Bash@3
  name: Build
  displayName: Build
  inputs:
    targetType: 'inline'
    script: |
      mkdir build && cd build
      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
      make -j
    workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}

- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: 'inline'
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: mscclpp/test/deploy/deploy.sh
    arguments: nccltest-single-node
    workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp

- task: Bash@3
  name: CopyMscclUsers
  displayName: Copy msccl-users
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      DST_DIR="/tmp/mscclpp/msccl-users"
      parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# - task: Bash@3
#   name: GenerateExecutionFile
#   displayName: Generate execution file
#   inputs:
#     targetType: 'inline'
#     script: |
#       set -e
#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
#       SSH_OPTION="StrictHostKeyChecking=no"
#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
#         cd /root/mscclpp/msccl-users; \
#         mkdir -p execution-files; \
#         cd /root/mscclpp/msccl-users; \
#         bash algos/mscclpp_a100/generate_execution_plan.sh"'
#     workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: InstallNcclTests
  displayName: Install NCCL Tests
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
        cd nccl-tests; \
        MPI=1 MPI_HOME=/usr/local/mpi make -j"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# - task: Bash@3
#   name: RunNcclAllReduceTest
#   displayName: Run NCCL AllReduce Test
#   inputs:
#     targetType: inline
#     script: |
#       set -e
#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
#       SSH_OPTION="StrictHostKeyChecking=no"
#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
#         cd /root/mscclpp; \
#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
#     workingDirectory: '$(System.DefaultWorkingDirectory)'

# - task: Bash@3
#   name: RunNcclAllGatherTest
#   displayName: Run NCCL AllGather Test
#   inputs:
#     targetType: inline
#     script: |
#       set -e
#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
#       SSH_OPTION="StrictHostKeyChecking=no"
#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
#         cd /root/mscclpp; \
#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
#     workingDirectory: '$(System.DefaultWorkingDirectory)'

# - task: Bash@3
#   name: RunNcclReduceScatterTest
#   displayName: Run NCCL Reduce Scatter Test
#   inputs:
#     targetType: inline
#     script: |
#       set -e
#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
#       SSH_OPTION="StrictHostKeyChecking=no"
#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
#         cd /root/mscclpp; \
#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
#     workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: InstallNccl
  displayName: Install NCCL
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
        LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\\\" -f4); \
        if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \
        cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \
        cd nccl; \
        make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: RunNcclAllGatherFallbaclkToNcclTest
  displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: RunNcclAllReduceFallbaclkToNcclTest
  displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
  name: RunNcclBroadcastFallbaclkToNcclTest
  displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
  inputs:
    targetType: 'inline'
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
      ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \
        cd /root/mscclpp; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
        echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
        mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# - task: Bash@3
#   name: RunNcclReduceScatterFallbaclkToNcclTest
#   displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation
#   inputs:
#     targetType: 'inline'
#     script: |
#       set -e
#       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
#       ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp
#       SSH_OPTION="StrictHostKeyChecking=no"
#       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
#       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
#         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
#         cd /root/mscclpp; \
#         echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
#         echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \
#         mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
#     workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
80
.azure-pipelines/templates/nccl-test.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
# .azure-pipelines/templates/nccl-test.yml
|
||||
# ----------------------------------------
|
||||
# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
|
||||
#
|
||||
# Parameters:
|
||||
# subscription – Azure subscription to use for VMSS start/stop
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: '80'
|
||||
- name: nvccGencode
|
||||
type: string
|
||||
default: "-gencode=arch=compute_80,code=sm_80"
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'nccltest-single-node'
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallNcclTests
|
||||
displayName: Install NCCL Tests
|
||||
remoteScript: |
|
||||
cd
|
||||
git clone https://github.com/NVIDIA/nccl-tests.git
|
||||
cd nccl-tests
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallNccl
|
||||
displayName: Install NCCL
|
||||
remoteScript: |
|
||||
LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4)
|
||||
if [ -z "$LATEST_TAG" ]; then
|
||||
echo "Failed to fetch latest NCCL tag"
|
||||
exit 1
|
||||
fi
|
||||
cd
|
||||
git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git
|
||||
cd nccl
|
||||
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclAllGatherFallbaclkToNcclTest
|
||||
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
|
||||
remoteScript: |
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclAllReduceFallbackToNcclTest
|
||||
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
|
||||
remoteScript: |
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclBroadcastFallbackToNcclTest
|
||||
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
|
||||
remoteScript: |
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
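For reference, a job would consume this template roughly as sketched below; the job name is illustrative, the pool, image, subscription, and VMSS names are borrowed from the H100 jobs elsewhere in this change, and the sm_90 gencode value is an assumed example rather than something defined here.
- job: NcclTestH100
  timeoutInMinutes: 60
  pool:
    name: msccl-ci-h100
  container:
    image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
  steps:
    - template: templates/nccl-test.yml
      parameters:
        subscription: mscclpp-ci-h100
        vmssName: mscclpp-h100-ci
        gpuArch: '90'
        nvccGencode: '-gencode=arch=compute_90,code=sm_90'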
@@ -1,142 +0,0 @@
|
||||
# .azure-pipelines/templates/rccl-test.yaml
|
||||
# ------------------------------------------------
|
||||
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
|
||||
#
|
||||
# Parameters:
|
||||
# subscription – Azure subscription to use for VMSS start/stop
|
||||
# vmssName – VMSS name to start/stop
|
||||
# sshKeySecureFile – the secureFile name for your SSH key
|
||||
# gpuArch – GPU architecture (e.g. gfx942)
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: "gfx942"
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test true rocm"
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallRcclTests
|
||||
displayName: Install RCCL Tests
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd; \
|
||||
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \
|
||||
cd rocm-systems; \
|
||||
git sparse-checkout init --cone; \
|
||||
git sparse-checkout set projects/rccl-tests; \
|
||||
git checkout; \
|
||||
cd projects/rccl-tests; \
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j"'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: RunRcclAllGatherTest
|
||||
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: RunRcclAllReduceTest
|
||||
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
ROOT_DIR=$(System.DefaultWorkingDirectory)
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \
|
||||
echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
63
.azure-pipelines/templates/rccl-test.yml
Normal file
@@ -0,0 +1,63 @@
|
||||
# .azure-pipelines/templates/rccl-test.yml
|
||||
# ------------------------------------------------
|
||||
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
|
||||
#
|
||||
# Parameters:
|
||||
# subscription – Azure subscription to use for VMSS start/stop
|
||||
# vmssName – VMSS name to start/stop
|
||||
# gpuArch – GPU architecture (e.g. gfx942)
|
||||
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: "gfx942"
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: rocm
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
buildTests: false
|
||||
deployArgs: 'single-node-test true rocm'
|
||||
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallRcclTests
|
||||
displayName: Install RCCL Tests
|
||||
remoteScript: |
|
||||
cd
|
||||
git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git
|
||||
cd rocm-systems
|
||||
git sparse-checkout init --cone
|
||||
git sparse-checkout set projects/rccl-tests
|
||||
git checkout
|
||||
cd projects/rccl-tests
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunRcclAllGatherTest
|
||||
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
|
||||
remoteScript: |
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunRcclAllReduceTest
|
||||
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
|
||||
remoteScript: |
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
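Likewise, a sketch of how an MI300X job could consume this template; the job name is illustrative, while the pool, image, subscription, and VMSS names are taken from the MI300X unit-test job later in this change.
- job: RcclTestMI300X
  timeoutInMinutes: 60
  pool:
    name: msccl-ci-mi300x
  container:
    image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
  steps:
    - template: templates/rccl-test.yml
      parameters:
        subscription: mscclpp-ci-mi300x
        vmssName: mscclpp-mi300x-ci
        gpuArch: gfx942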
31
.azure-pipelines/templates/run-remote-task.yml
Normal file
@@ -0,0 +1,31 @@
parameters:
- name: name
type: string
default: ''
- name: displayName
type: string
- name: runRemoteArgs
type: string
default: ''
- name: remoteScript
type: string
- name: workingDirectory
type: string
default: '$(System.DefaultWorkingDirectory)'
- name: continueOnError
type: boolean
default: false

steps:
- task: Bash@3
${{ if ne(parameters.name, '') }}:
name: ${{ parameters.name }}
displayName: ${{ parameters.displayName }}
continueOnError: ${{ parameters.continueOnError }}
inputs:
targetType: 'inline'
script: |
test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD'
${{ parameters.remoteScript }}
REMOTE_CMD
workingDirectory: ${{ parameters.workingDirectory }}
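The other templates in this change call this wrapper as in the sketch below (taken from the unit-test template); because the REMOTE_CMD heredoc is quoted, the remoteScript body is handed to test/deploy/run-remote.sh on stdin without the local shell expanding it first.
- template: run-remote-task.yml
  parameters:
    name: UnitTests
    displayName: Run mscclpp unit tests
    remoteScript: |
      ./build/bin/unit_tests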
20
.azure-pipelines/templates/stop.yml
Normal file
@@ -0,0 +1,20 @@
parameters:
- name: subscription
type: string
- name: vmssName
type: string
- name: resourceGroup
type: string
default: mscclpp

steps:
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }}
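Callers in this change pass only subscription and vmssName and rely on the default mscclpp resource group; a sketch of overriding it is shown below, where the resource group name is hypothetical.
- template: stop.yml
  parameters:
    subscription: mscclpp-ci
    vmssName: mscclpp-ci
    resourceGroup: mscclpp-staging   # hypothetical override; defaults to mscclpp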
42
.azure-pipelines/templates/ut-executor.yml
Normal file
@@ -0,0 +1,42 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: ${{ parameters.platform }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: ExecutorTest
|
||||
displayName: Run executor tests
|
||||
remoteScript: |
|
||||
python3 -m pip install .
|
||||
PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
|
||||
TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
|
||||
mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,191 +0,0 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: single-node-test false
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
|
||||
- task: Bash@3
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
./build/bin/unit_tests"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: StopContainer
|
||||
displayName: Stop existing container
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true"
|
||||
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: BuildWithIb
|
||||
displayName: Rebuild with IB
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
rm -rf build && mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnvWithIb
|
||||
displayName: Deploy Test Env (with IB build)
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: single-node-test false
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
|
||||
- task: Bash@3
|
||||
name: PyTestsWithIbBuildDisableIb
|
||||
displayName: Run pytests (IB build, IB tests disabled)
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
95
.azure-pipelines/templates/ut-no-ib-env.yml
Normal file
@@ -0,0 +1,95 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
|
||||
deployArgs: 'single-node-test false'
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
remoteScript: |
|
||||
./build/bin/unit_tests
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: StopContainer
|
||||
displayName: Stop existing container
|
||||
runRemoteArgs: '--no-docker --no-log'
|
||||
remoteScript: |
|
||||
sudo docker stop mscclpp-test || true
|
||||
sudo docker rm mscclpp-test || true
|
||||
|
||||
- task: Bash@3
|
||||
displayName: Remove generated SSH key files
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: BuildWithIb
|
||||
displayName: Rebuild with IB
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
rm -rf build
|
||||
mkdir -p build && cd build
|
||||
cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DMSCCLPP_BYPASS_GPU_CHECK=ON \
|
||||
-DMSCCLPP_USE_CUDA=ON \
|
||||
-DMSCCLPP_BUILD_TESTS=ON \
|
||||
-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnvWithIb
|
||||
displayName: Deploy Test Env (with IB build)
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: single-node-test false
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTestsWithIbBuildDisableIb
|
||||
displayName: Run pytests (IB build, IB tests disabled)
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,145 +0,0 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
|
||||
steps:
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
mkdir -p build && cd build; \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
|
||||
make -j"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
# set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \
|
||||
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
57
.azure-pipelines/templates/ut-npkit.yml
Normal file
@@ -0,0 +1,57 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
remoteScript: |
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce"
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
remoteScript: |
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
|
||||
export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'
|
||||
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output
|
||||
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,142 +0,0 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
if [ "${{ parameters.platform }}" == "rocm" ]; then
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
else
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
fi
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: DownloadSecureFile@1
|
||||
name: SshKeyFile
|
||||
displayName: Download key file
|
||||
inputs:
|
||||
secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install pssh -y
|
||||
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StartVMSS
|
||||
displayName: Start VMSS
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
- task: Bash@3
|
||||
name: DeployTestEnv
|
||||
displayName: Deploy Test Env
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: "single-node-test true ${{ parameters.platform }}"
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
- task: Bash@3
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
./build/bin/unit_tests"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
cd /root/mscclpp; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
condition: always()
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.subscription }}
|
||||
scriptType: bash
|
||||
scriptLocation: inlineScript
|
||||
inlineScript: |
|
||||
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
49
.azure-pipelines/templates/ut.yml
Normal file
@@ -0,0 +1,49 @@
|
||||
parameters:
|
||||
- name: subscription
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: platform
|
||||
type: string
|
||||
default: 'cuda'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
platform: ${{ parameters.platform }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
remoteScript: |
|
||||
./build/bin/unit_tests
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x
|
||||
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,50 +0,0 @@
|
||||
trigger:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- apps/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
pr:
|
||||
branches:
|
||||
include:
|
||||
- main
|
||||
- release/*
|
||||
drafts: false
|
||||
paths:
|
||||
exclude:
|
||||
- .devcontainer/**
|
||||
- .github/**
|
||||
- apps/**
|
||||
- docker/**
|
||||
- docs/**
|
||||
- '**/*.md'
|
||||
|
||||
jobs:
|
||||
- job: UnitTestMI300X
|
||||
timeoutInMinutes: 40
|
||||
pool:
|
||||
name: msccl-ci-mi300x
|
||||
strategy:
|
||||
matrix:
|
||||
rocm6_2:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
platform: rocm
|
||||
gpuArch: gfx942
|
||||
@@ -37,17 +37,16 @@ jobs:
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: UnitTestWithNpKitA100
|
||||
@@ -59,17 +58,16 @@ jobs:
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-npkit.yaml
|
||||
- template: templates/ut-npkit.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: UnitTestH100
|
||||
@@ -79,17 +77,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestWithNpKitH100
|
||||
@@ -99,17 +96,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-npkit.yaml
|
||||
- template: templates/ut-npkit.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestNoIBEnv
|
||||
@@ -121,15 +117,55 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-no-ib-env.yaml
|
||||
- template: templates/ut-no-ib-env.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '90'
|
||||
|
||||
- job: UnitTestMI300X
|
||||
timeoutInMinutes: 40
|
||||
pool:
|
||||
name: msccl-ci-mi300x
|
||||
strategy:
|
||||
matrix:
|
||||
rocm6_2:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
platform: rocm
|
||||
gpuArch: gfx942
|
||||
|
||||
- job: UnitTestExecutor
|
||||
timeoutInMinutes: 60
|
||||
displayName: Test DSL Executor
|
||||
pool:
|
||||
name: msccl-ci-h100
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-executor.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
gpuArch: '90'
|
||||
24
.codecov.yml
Normal file
@@ -0,0 +1,24 @@
codecov:
require_ci_to_pass: yes

coverage:
status:
project:
default:
target: 68%
threshold: 1%
patch:
default:
target: 80%

flag_management:
default_rules:
carryforward: true

ignore:
- "test/"
- "examples/"
- "python/"
- "tools/"
- "docs/"
- "docker/"
2
.github/copilot-instructions.md
vendored
@@ -43,7 +43,7 @@ For testing after successful build:
|
||||
# To run tests with two GPUs - two is enough for most tests
|
||||
mpirun -np 2 ./build/bin/mp_unit_tests
|
||||
# To run tests excluding IB-related ones (when IB is not available)
|
||||
mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib*
|
||||
mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib*
|
||||
```
|
||||
|
||||
For building a Python package:
|
||||
|
||||
6
.github/workflows/codeql-analysis.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
version: [ 'cuda11.8', 'cuda12.8' ]
|
||||
version: [ 'cuda11.8', 'cuda12.9' ]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
rm -rf build && mkdir build && cd build
|
||||
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
|
||||
cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF ..
|
||||
make -j4
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
rm -rf build && mkdir build && cd build
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF ..
|
||||
make -j4
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
|
||||
69
.github/workflows/integration-test-backup.yml
vendored
@@ -1,69 +0,0 @@
|
||||
name: IntegrationTest
|
||||
|
||||
on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
IntegrationTest:
|
||||
runs-on: [ self-hosted, A100 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: [ cuda11.8, cuda12.2 ]
|
||||
|
||||
container:
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
|
||||
- name: Lock GPU clock frequency
|
||||
run: |
|
||||
sudo nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
||||
done
|
||||
|
||||
- name: Run mscclpp AllGather test
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp SendRecv test
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp AllReduce test
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp AllToAll test
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
|
||||
- name: Check collective primitives performance
|
||||
run: |
|
||||
set -e
|
||||
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
|
||||
2
.github/workflows/mscclpp-lang.yml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ 'cuda11.8', 'cuda12.8' ]
|
||||
version: [ 'cuda11.8', 'cuda12.9' ]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
52
.github/workflows/ut-backup.yml
vendored
@@ -1,52 +0,0 @@
|
||||
name: UnitTest
|
||||
|
||||
on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
UnitTest:
|
||||
runs-on: [ self-hosted, A100 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: [ cuda11.8, cuda12.2 ]
|
||||
|
||||
container:
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: LockGPUClock
|
||||
run: |
|
||||
sudo nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
||||
done
|
||||
|
||||
- name: UnitTests
|
||||
run: |
|
||||
./build/bin/unit_tests
|
||||
|
||||
- name: MpUnitTests
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
|
||||
|
||||
- name: PyTests
|
||||
run: |
|
||||
set -e
|
||||
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x
|
||||
.gitignore

@@ -1,5 +1,6 @@
.vscode/
build/
build_coverage/
__pycache__
.*.swp
*.so
@@ -1,5 +1,5 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
cmake_minimum_required(VERSION 3.25)
|
||||
project(mscclpp LANGUAGES CXX)
|
||||
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
|
||||
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
|
||||
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
|
||||
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
|
||||
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
|
||||
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
|
||||
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
|
||||
|
||||
@@ -99,6 +100,62 @@ else()
|
||||
message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Code coverage setup
|
||||
if(MSCCLPP_ENABLE_COVERAGE)
|
||||
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
|
||||
message(STATUS "Code coverage enabled")
|
||||
|
||||
# Add coverage flags to C++ targets only (not CUDA)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
|
||||
add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
|
||||
|
||||
# Find lcov
|
||||
find_program(LCOV_PATH lcov)
|
||||
|
||||
if(NOT LCOV_PATH)
|
||||
message(WARNING "lcov not found. Install lcov to generate coverage reports.")
|
||||
endif()
|
||||
|
||||
if(LCOV_PATH)
|
||||
# Add coverage target
|
||||
add_custom_target(coverage
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
|
||||
COMMAND ${LCOV_PATH} --directory . --zerocounters
|
||||
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
|
||||
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
|
||||
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
|
||||
COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
|
||||
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
|
||||
COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
|
||||
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
|
||||
|
||||
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
|
||||
COMMENT "Generating code coverage report"
|
||||
)
|
||||
|
||||
# Add coverage clean target
|
||||
add_custom_target(coverage-clean
|
||||
COMMAND ${CMAKE_COMMAND} -E remove coverage.info
|
||||
COMMAND ${LCOV_PATH} --directory . --zerocounters
|
||||
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
|
||||
COMMENT "Cleaning coverage data"
|
||||
)
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "Code coverage is only supported with GCC or Clang compilers")
|
||||
endif()
|
||||
endif()
|
||||
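As a usage sketch for the coverage option added above (the build directory name is an assumption, not part of this change):

```bash
# Configure a Debug build with coverage instrumentation (GCC/Clang only)
cmake -S . -B build_coverage -DCMAKE_BUILD_TYPE=Debug -DMSCCLPP_ENABLE_COVERAGE=ON
cmake --build build_coverage -j
# The custom "coverage" target zeroes the counters, runs ctest, and writes coverage.info via lcov
cmake --build build_coverage --target coverage
```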
if(MSCCLPP_GPU_ARCHS)
|
||||
string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
|
||||
string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
|
||||
@@ -167,9 +224,30 @@ if(MSCCLPP_USE_IB)
|
||||
if(NOT IBVERBS_FOUND)
|
||||
message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
|
||||
endif()
|
||||
find_package(MLX5)
|
||||
if(MLX5_FOUND)
|
||||
message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
|
||||
else()
|
||||
message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
|
||||
endif()
|
||||
endif()
|
||||
find_package(NUMA REQUIRED)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
|
||||
if(MSCCLPP_USE_ROCM)
|
||||
set(MSCCLPP_USE_GDRCOPY OFF)
|
||||
endif()
|
||||
if(MSCCLPP_USE_GDRCOPY)
|
||||
find_package(GDRCopy)
|
||||
if(NOT GDRCOPY_FOUND)
|
||||
message(STATUS "GDRCopy not found, disabling GDRCopy support")
|
||||
set(MSCCLPP_USE_GDRCOPY OFF)
|
||||
else()
|
||||
message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(json
|
||||
GIT_REPOSITORY https://github.com/nlohmann/json.git
|
||||
|
||||
README.md
@@ -3,15 +3,16 @@
|
||||
[](https://github.com/microsoft/mscclpp/releases/latest)
|
||||
[](LICENSE)
|
||||
[](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
|
||||
[](https://microsoft.github.io/mscclpp/)
|
||||
[](https://microsoft.github.io/mscclpp/)
|
||||
[](https://codecov.io/gh/microsoft/mscclpp)
|
||||
|
||||
| Testing Pipelines | Build Status |
|
||||
|--------------------------|-------------------|
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
| Unit Tests (ROCm) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
|
||||
| NCCL Tests | [](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
|
||||
| RCCL Tests | [](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Unit Tests (ROCm) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
| NCCL Tests | [)](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
|
||||
| RCCL Tests | [)](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
|
||||
|
||||
A GPU-driven communication stack for scalable AI applications.
|
||||
|
||||
|
||||
cmake/FindGDRCopy.cmake (new file)
@@ -0,0 +1,50 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE)
|
||||
#
|
||||
# The following variables are optionally searched for defaults
|
||||
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
|
||||
# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
|
||||
# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found
|
||||
|
||||
# The following are set after configuration is done:
|
||||
# GDRCOPY_FOUND
|
||||
# GDRCOPY_INCLUDE_DIRS
|
||||
# GDRCOPY_LIBRARIES
|
||||
|
||||
find_path(GDRCOPY_INCLUDE_DIRS
|
||||
NAMES gdrapi.h
|
||||
HINTS
|
||||
${GDRCOPY_INCLUDE_DIR}
|
||||
${GDRCOPY_ROOT_DIR}
|
||||
${GDRCOPY_ROOT_DIR}/include
|
||||
/usr/local/include
|
||||
/usr/include)
|
||||
|
||||
find_library(GDRCOPY_LIBRARIES
|
||||
NAMES gdrapi
|
||||
HINTS
|
||||
${GDRCOPY_LIB_DIR}
|
||||
${GDRCOPY_ROOT_DIR}
|
||||
${GDRCOPY_ROOT_DIR}/lib
|
||||
/usr/local/lib
|
||||
/usr/lib
|
||||
/usr/lib/x86_64-linux-gnu)
|
||||
|
||||
if(GDRCOPY_INCLUDE_DIRS)
|
||||
include(CheckSymbolExists)
|
||||
set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS})
|
||||
set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES})
|
||||
check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2)
|
||||
unset(CMAKE_REQUIRED_LIBRARIES)
|
||||
unset(CMAKE_REQUIRED_INCLUDES)
|
||||
if(NOT GDRCOPY_HAS_PIN_BUFFER_V2)
|
||||
message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.")
|
||||
set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
|
||||
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
|
||||
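As a hedged example of how the hint variables documented above can be used when GDRCopy is installed in a non-standard prefix (the paths below are placeholders):

```bash
cmake -DGDRCOPY_ROOT_DIR=/opt/gdrcopy ..
# or point at the include and library directories separately
cmake -DGDRCOPY_INCLUDE_DIR=/opt/gdrcopy/include -DGDRCOPY_LIB_DIR=/opt/gdrcopy/lib ..
```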
cmake/FindMLX5.cmake (new file)
@@ -0,0 +1,38 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# Find the MLX5 Direct Verbs (mlx5dv) library
|
||||
#
|
||||
# The following variables are optionally searched for defaults
|
||||
# MLX5_ROOT_DIR: Base directory where all MLX5 components are found
|
||||
# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found
|
||||
# MLX5_LIB_DIR: Directory where MLX5 libraries are found
|
||||
|
||||
# The following are set after configuration is done:
|
||||
# MLX5_FOUND
|
||||
# MLX5_INCLUDE_DIRS
|
||||
# MLX5_LIBRARIES
|
||||
|
||||
find_path(MLX5_INCLUDE_DIRS
|
||||
NAMES infiniband/mlx5dv.h
|
||||
HINTS
|
||||
${MLX5_INCLUDE_DIR}
|
||||
${MLX5_ROOT_DIR}
|
||||
${MLX5_ROOT_DIR}/include
|
||||
/usr/local/include
|
||||
/usr/include)
|
||||
|
||||
find_library(MLX5_LIBRARIES
|
||||
NAMES mlx5
|
||||
HINTS
|
||||
${MLX5_LIB_DIR}
|
||||
${MLX5_ROOT_DIR}
|
||||
${MLX5_ROOT_DIR}/lib
|
||||
/usr/local/lib
|
||||
/usr/lib
|
||||
/usr/lib/x86_64-linux-gnu)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
|
||||
find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
|
||||
mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES)
|
||||
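The MLX5 find module accepts analogous hints; for instance (paths are placeholders):

```bash
cmake -DMLX5_ROOT_DIR=/usr -DMLX5_LIB_DIR=/usr/lib/x86_64-linux-gnu ..
```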
@@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
htop \
|
||||
lcov \
|
||||
vim \
|
||||
&& \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
|
||||
# Install lcov 2.2
|
||||
RUN LCOV_VERSION="2.2" && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cpanminus \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
&& \
|
||||
cpanm --notest \
|
||||
Capture::Tiny \
|
||||
DateTime \
|
||||
JSON::XS \
|
||||
Memory::Process \
|
||||
TimeDate \
|
||||
&& \
|
||||
cd /tmp && \
|
||||
curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \
|
||||
tar xzf lcov.tar.gz && \
|
||||
cd lcov-${LCOV_VERSION} && \
|
||||
make install && \
|
||||
cd / && rm -rf /tmp/lcov* && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
|
||||
# Install CMake 3.26.4
|
||||
RUN OS_ARCH=$(uname -m) && \
|
||||
CMAKE_VERSION="3.26.4" && \
|
||||
@@ -24,8 +49,25 @@ RUN OS_ARCH=$(uname -m) && \
|
||||
rm -rf ${CMAKE_HOME}.tar.gz && \
|
||||
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
|
||||
|
||||
# Install ROCm-specific packages if building for ROCm
|
||||
# Install GDRCopy userspace library for CUDA targets
|
||||
ARG TARGET="cuda13.0"
|
||||
RUN if echo "$TARGET" | grep -q "^cuda"; then \
|
||||
GDRCOPY_VERSION="2.5.2" && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
|
||||
cd /tmp && \
|
||||
curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
|
||||
tar xzf gdrcopy.tar.gz && \
|
||||
cd gdrcopy-${GDRCOPY_VERSION}/packages && \
|
||||
./build-deb-packages.sh -k -t && \
|
||||
dpkg -i libgdrapi_*.deb && \
|
||||
cd / && rm -rf /tmp/gdrcopy* && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*; \
|
||||
fi
|
||||
|
||||
# Install ROCm-specific packages if building for ROCm
|
||||
RUN if echo "$TARGET" | grep -q "^rocm"; then \
|
||||
apt-get update -y && \
|
||||
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
|
||||
@@ -47,7 +89,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
|
||||
export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \
|
||||
fi && \
|
||||
pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r python/requirements_${target_type}.txt
|
||||
pip install --no-cache-dir -r python/requirements_${target_type}.txt && \
|
||||
pip install --no-cache-dir coverage xlsxwriter
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /tmp/mscclpp
|
||||
|
||||
@@ -4,27 +4,22 @@ set -e
|
||||
|
||||
declare -A baseImageTable
|
||||
baseImageTable=(
|
||||
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
|
||||
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
|
||||
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
|
||||
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
|
||||
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04"
|
||||
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
|
||||
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
|
||||
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
|
||||
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04"
|
||||
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
|
||||
["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2"
|
||||
)
|
||||
|
||||
declare -A extraLdPathTable
|
||||
extraLdPathTable=(
|
||||
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
|
||||
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
|
||||
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
|
||||
["rocm6.2"]="/opt/rocm/lib"
|
||||
)
|
||||
|
||||
declare -A ofedVersionTable
|
||||
ofedVersionTable=(
|
||||
["cuda11.8"]="23.07-0.5.1.2"
|
||||
["cuda12.4"]="23.07-0.5.1.2"
|
||||
["cuda12.8"]="24.10-1.1.4.0"
|
||||
["cuda12.9"]="24.10-1.1.4.0"
|
||||
@@ -36,7 +31,7 @@ TARGET=${1}
|
||||
OS_ARCH=$(uname -m)
|
||||
|
||||
print_usage() {
|
||||
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
|
||||
echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
|
||||
}
|
||||
|
||||
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
|
||||
|
||||
@@ -5,7 +5,7 @@
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SPHINXMULTIVERSION ?= sphinx-multiversion
SPHINXMULTIVERSION ?= python3 build_multiversion.py
SOURCEDIR = .
BUILDDIR = _build

docs/build_multiversion.py (new file)
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Wrapper around sphinx-multiversion that patches copy_tree to generate
|
||||
_version.py in each tag checkout. This is needed because setuptools_scm
|
||||
generates _version.py at build time, but sphinx-multiversion uses
|
||||
`git archive` which only contains committed files.
|
||||
|
||||
Usage (called by Makefile):
|
||||
python3 build_multiversion.py <sourcedir> <outputdir> [sphinx-opts...]
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import sphinx_multiversion.git as smv_git
|
||||
from sphinx_multiversion import main as smv_main
|
||||
|
||||
# Save the original copy_tree
|
||||
_original_copy_tree = smv_git.copy_tree
|
||||
|
||||
|
||||
def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."):
|
||||
"""Call original copy_tree, then generate _version.py from the VERSION file."""
|
||||
_original_copy_tree(gitroot, src, dst, reference, sourcepath)
|
||||
|
||||
# Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0")
|
||||
refname = getattr(reference, "refname", "") or ""
|
||||
match = re.search(r"v(\d+\.\d+\.\d+)", refname)
|
||||
if not match:
|
||||
return
|
||||
|
||||
version = match.group(1)
|
||||
version_py_dir = os.path.join(dst, "python", "mscclpp")
|
||||
if os.path.isdir(version_py_dir):
|
||||
version_py = os.path.join(version_py_dir, "_version.py")
|
||||
if not os.path.exists(version_py):
|
||||
with open(version_py, "w") as f:
|
||||
f.write(f'__version__ = "{version}"\n')
|
||||
|
||||
|
||||
# Monkey-patch
|
||||
smv_git.copy_tree = _patched_copy_tree
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(smv_main(sys.argv[1:]))
|
||||
@@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol
python3 -m mscclpp --install
```

This installs bundled default execution plans into `~/.cache/mscclpp/default` by default.
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`.
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
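For example (the directory below is a placeholder):

```bash
# Install bundled plans under a custom cache root; do not append "default" yourself
export MSCCLPP_CACHE_DIR=/mnt/cache/mscclpp
python3 -m mscclpp --install   # plans are placed in $MSCCLPP_CACHE_DIR/default
```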
|
||||
## Your First Algorithm: AllGather
|
||||
|
||||
Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features.
|
||||
|
||||
@@ -59,6 +59,9 @@ After installation, the generated JSON execution plan can be found at:
|
||||
~/.cache/mscclpp/default/
|
||||
```
|
||||
|
||||
If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`.
|
||||
`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path.
|
||||
|
||||
**Performance Results:**
|
||||
|
||||
The figure below shows the performance characteristics for small message sizes in a two-node configuration:
|
||||
|
||||
@@ -332,7 +332,8 @@ public:
|
||||
size_t inputSize, size_t outputSize,
|
||||
mscclpp::DataType dtype, mscclpp::ReduceOp op,
|
||||
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
|
||||
const std::unordered_map<std::string, uintptr_t>& extras) {
|
||||
const std::unordered_map<std::string, uintptr_t>& extras,
|
||||
[[maybe_unused]] mscclpp::DataType accumDtype) {
|
||||
return self->kernelFunc(ctx, input, output, inputSize, dtype, stream);
|
||||
},
|
||||
// Context initialization function
|
||||
|
||||
@@ -25,12 +25,15 @@
```bash
sudo apt-get install libnuma-dev
```
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package
```bash
sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)"
sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)"
```
If you don't want to build the Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
* (Optional, for benchmarks) MPI
* (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1
* GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions).
* Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation), or the sketch after this list.
* Others
* For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.
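The sketch below shows one way to build and install only the GDRCopy userspace library from source; the version tag and install prefix are assumptions (the tag is taken from the Dockerfile change elsewhere in this commit), and the gdrdrv kernel module still has to be installed separately for GDRCopy to work at runtime:

```bash
# Minimal sketch: build and install only the userspace library (no kernel module)
git clone --branch v2.5.2 https://github.com/NVIDIA/gdrcopy.git
cd gdrcopy
make prefix=/usr/local lib lib_install
```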
@@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash
|
||||
$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash
|
||||
# For AMD platforms
|
||||
$ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash
|
||||
```
|
||||
@@ -97,13 +100,30 @@ There are a few optional CMake options you can set:
|
||||
(install-from-source-python-module)=
|
||||
## Install from Source (Python Module)
|
||||
|
||||
Python 3.8 or later is required.
|
||||
Python 3.10 or later is required.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ python -m pip install .
|
||||
# For AMD platforms, set the C++ compiler to HIPCC
|
||||
$ CXX=/opt/rocm/bin/hipcc python -m pip install .
|
||||
# For NVIDIA platforms (specify your CUDA version)
|
||||
$ python -m pip install ".[cuda12]"
|
||||
# For AMD platforms
|
||||
$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]"
|
||||
```
|
||||
|
||||
> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy.
|
||||
> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source,
|
||||
> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy.
|
||||
|
||||
Optional extras can be installed by specifying them in brackets. Available extras:
|
||||
- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version.
|
||||
- **`rocm6`**: Install CuPy from source for AMD ROCm platforms.
|
||||
- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib).
|
||||
- **`test`**: Install test dependencies (pytest, mpi4py, netifaces).
|
||||
|
||||
```bash
|
||||
# Example: install with CUDA 12 and benchmark extras
|
||||
$ python -m pip install ".[cuda12,benchmark]"
|
||||
# Example: install with all extras for testing on CUDA 12
|
||||
$ python -m pip install ".[cuda12,benchmark,test]"
|
||||
```
|
||||
|
||||
(vscode-dev-container)=
|
||||
@@ -155,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
|
||||
[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
|
||||
|
||||
```bash
|
||||
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
|
||||
$ python3 -m pip install -r ./python/requirements_cuda12.txt
|
||||
# Install with benchmark dependencies and the appropriate CUDA/ROCm extras.
|
||||
# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
|
||||
$ python3 -m pip install ".[cuda12,benchmark,test]"
|
||||
$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
```
|
||||
|
||||
@@ -171,7 +192,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap
|
||||
For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
|
||||
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
|
||||
```
|
||||
|
||||
@@ -189,13 +209,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec

Example 1: Allreduce will fall back to NCCL's ncclAllReduce since allreduce is in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

Example 2: ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list.
```bash
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH;
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
```

@@ -101,7 +101,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
|
||||
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
|
||||
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
|
||||
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
|
||||
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
|
||||
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
|
||||
[[maybe_unused]] mscclpp::DataType accumDtype) {
|
||||
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream);
|
||||
},
|
||||
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
|
||||
@@ -69,7 +69,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
|
||||
"allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
|
||||
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize, size_t outputSize,
|
||||
mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks,
|
||||
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
|
||||
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras,
|
||||
[[maybe_unused]] mscclpp::DataType accumDtype) {
|
||||
return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream);
|
||||
},
|
||||
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
|
||||
@@ -1,193 +1,117 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py
|
||||
# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py
|
||||
|
||||
import os
|
||||
import torch
|
||||
import mscclpp.utils as mscclpp_utils
|
||||
import mscclpp
|
||||
import mscclpp.ext
|
||||
import netifaces as ni
|
||||
import ipaddress
|
||||
|
||||
import netifaces as ni
|
||||
import torch
|
||||
import mscclpp
|
||||
import mscclpp.ext
|
||||
import mscclpp.utils as mscclpp_utils
|
||||
|
||||
def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection:
|
||||
collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
|
||||
return collection_builder.build_default_algorithms(
|
||||
scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
|
||||
# -- Helpers ------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor:
|
||||
"""Allocate a tensor backed by RawGpuBuffer (symmetric memory)."""
|
||||
# PyTorch's from_dlpack does not support certain float8 DLPack type codes.
|
||||
# Work around by importing as uint8 and reinterpreting via .view().
|
||||
_DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz)
|
||||
if dtype in _DLPACK_UNSUPPORTED:
|
||||
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8))
|
||||
return torch.utils.dlpack.from_dlpack(dlpack).view(dtype)
|
||||
dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype))
|
||||
return torch.utils.dlpack.from_dlpack(dlpack)
|
||||
|
||||
|
||||
def _load_algorithms(scratch: torch.Tensor, rank: int):
|
||||
return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms(
|
||||
scratch_buffer=scratch.data_ptr(),
|
||||
scratch_buffer_size=scratch.nbytes,
|
||||
rank=rank,
|
||||
)
|
||||
|
||||
|
||||
def interfaces_for_ip_netifaces(ip: str):
|
||||
def _interfaces_for_ip(ip: str):
|
||||
target = ipaddress.ip_address(ip)
|
||||
for interface in ni.interfaces():
|
||||
addresses = ni.ifaddresses(interface)
|
||||
if ni.AF_INET in addresses:
|
||||
for link in addresses[ni.AF_INET]:
|
||||
if "addr" in link:
|
||||
addr = ipaddress.ip_address(link["addr"])
|
||||
if addr == target:
|
||||
return interface
|
||||
for iface in ni.interfaces():
|
||||
addrs = ni.ifaddresses(iface)
|
||||
if ni.AF_INET in addrs:
|
||||
for link in addrs[ni.AF_INET]:
|
||||
if "addr" in link and ipaddress.ip_address(link["addr"]) == target:
|
||||
return iface
|
||||
return None
|
||||
|
||||
|
||||
def to_mscclpp_reduce_op(op: torch.distributed.ReduceOp) -> mscclpp.ReduceOp:
|
||||
def _to_mscclpp_op(op) -> mscclpp.ReduceOp:
|
||||
if op == torch.distributed.ReduceOp.SUM:
|
||||
return mscclpp.ReduceOp.SUM
|
||||
elif op == torch.distributed.ReduceOp.MIN:
|
||||
if op == torch.distributed.ReduceOp.MIN:
|
||||
return mscclpp.ReduceOp.MIN
|
||||
else:
|
||||
raise ValueError(f"unsupported op: {op}")
|
||||
raise ValueError(f"unsupported op: {op}")
|
||||
|
||||
|
||||
def _round_pow2(size: int) -> int:
|
||||
"""Round up to next power-of-2, clamped to [1024, 256 MB]."""
|
||||
size = max(size, 1024)
|
||||
size = min(size, 256 << 20)
|
||||
return 1 << (size - 1).bit_length()
|
||||
|
||||
|
||||
# -- CustomizedComm -----------------------------------------------------------
|
||||
|
||||
|
||||
class CustomizedComm:
|
||||
def __init__(self, comm: mscclpp.CommGroup):
|
||||
"""Exposes all_reduce, all_gather, barrier with lazy per-size tuning."""
|
||||
|
||||
_TUNE_N_WARMUP = 5
|
||||
_TUNE_N_GRAPH_LAUNCHES = 10
|
||||
_TUNE_N_OPS_PER_GRAPH = 100
|
||||
_CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128]
|
||||
_CANDIDATE_NTHREADS = [512, 768, 1024]
|
||||
_NBLOCKS_LIMIT = {
|
||||
"default_allreduce_nvls_packet": 16,
|
||||
"default_allreduce_packet": 56,
|
||||
"default_allreduce_allpair_packet": 56,
|
||||
"default_allreduce_fullmesh": 64,
|
||||
"default_allgather_fullmesh2": 32,
|
||||
}
|
||||
|
||||
def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False):
|
||||
self.comm = comm
|
||||
self.rank = comm.my_rank
|
||||
self.world_size = comm.nranks
|
||||
self.local_rank = comm.my_rank % comm.nranks_per_node
|
||||
self.n_ranks_per_node = comm.nranks_per_node
|
||||
dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
|
||||
self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
|
||||
algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
|
||||
self._algorithm_nvls_packet = [
|
||||
algo
|
||||
for algo in algorithms
|
||||
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet"
|
||||
][0]
|
||||
self._algorithm_rsag_zero_copy = [
|
||||
algo
|
||||
for algo in algorithms
|
||||
if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy"
|
||||
][0]
|
||||
self._algorithm_packet = [
|
||||
algo for algo in algorithms if algo.collective == "allreduce" and algo.name == "default_allreduce_packet"
|
||||
][0]
|
||||
if mscclpp.is_nvls_supported():
|
||||
self._algorithm_nvls_zero_copy = [
|
||||
algo
|
||||
for algo in algorithms
|
||||
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy"
|
||||
][0]
|
||||
self._tune(n_warmup=5, n_graph_launches=10, n_ops_per_graph=100)
|
||||
self.symmetric_memory = symmetric_memory
|
||||
self._nvls = mscclpp.is_nvls_supported()
|
||||
|
||||
def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph):
|
||||
sizes = [1 << i for i in range(10, 28)]
|
||||
# Pre-fill with defaults for barrier
|
||||
self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)}
|
||||
self._scratch = _make_tensor(1 << 27, torch.float16)
|
||||
self._barrier_tensor = _make_tensor(4096, torch.float32)
|
||||
|
||||
tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
|
||||
tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor)
|
||||
tune_tensor.normal_()
|
||||
candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128]
|
||||
candidates_nthreads = [512, 768, 1024]
|
||||
algos = _load_algorithms(self._scratch, self.rank)
|
||||
self._algos = {(a.collective, a.name): a for a in algos}
|
||||
|
||||
for size in sizes:
|
||||
algos = []
|
||||
if mscclpp.is_nvls_supported():
|
||||
algos.append(self._algorithm_nvls_zero_copy)
|
||||
if size <= 4 * 1024 * 1024:
|
||||
algos.append(self._algorithm_nvls_packet)
|
||||
algos.append(self._algorithm_packet)
|
||||
if size >= 512 * 1024:
|
||||
algos.append(self._algorithm_rsag_zero_copy)
|
||||
# {collective: {rounded_size: (algo, nblocks, nthreads)}}
|
||||
self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}}
|
||||
self._tune_buf = None
|
||||
self._time_buf = None
|
||||
|
||||
best_time = float("inf")
|
||||
best_config = None
|
||||
def _algo(self, collective: str, name: str):
|
||||
return self._algos.get((collective, name))
|
||||
|
||||
for algo in algos:
|
||||
for nb in candidates_nblocks:
|
||||
if algo.name == "default_allreduce_nvls_packet" and nb > 16:
|
||||
continue
|
||||
if algo.name == "default_allreduce_packet" and nb > 56:
|
||||
continue
|
||||
for nt in candidates_nthreads:
|
||||
if self._run_algo(algo, tune_tensor, size, nb, nt) != 0:
|
||||
continue
|
||||
def _default_ar_config(self):
|
||||
"""Fallback allreduce config for barrier / timing sync."""
|
||||
pkt = self._algo("allreduce", "default_allreduce_nvls_packet")
|
||||
if self._nvls and pkt:
|
||||
return (pkt, 0, 0)
|
||||
return (self._algo("allreduce", "default_allreduce_packet"), 0, 0)
|
||||
|
||||
for _ in range(n_warmup):
|
||||
self._run_algo(algo, tune_tensor, size, nb, nt)
|
||||
self.barrier()
|
||||
# -- low-level execute --
|
||||
|
||||
capture_stream = torch.cuda.Stream()
|
||||
capture_stream.wait_stream(torch.cuda.current_stream())
|
||||
|
||||
g = torch.cuda.CUDAGraph()
|
||||
# Warmup on capture stream
|
||||
with torch.cuda.stream(capture_stream):
|
||||
self._run_algo(algo, tune_tensor, size, nb, nt)
|
||||
capture_stream.synchronize()
|
||||
|
||||
with torch.cuda.graph(g, stream=capture_stream):
|
||||
for _ in range(n_ops_per_graph):
|
||||
self._run_algo(algo, tune_tensor, size, nb, nt)
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event.record(capture_stream)
|
||||
with torch.cuda.stream(capture_stream):
|
||||
for _ in range(n_graph_launches):
|
||||
g.replay()
|
||||
end_event.record(capture_stream)
|
||||
end_event.synchronize()
|
||||
|
||||
elapsed = start_event.elapsed_time(end_event)
|
||||
|
||||
# Synchronize timing results across all ranks to ensure consistent algorithm selection
|
||||
# replicate the elapsed time across world_size entries due to algorithm size limitations
|
||||
time_tensor = torch.full((self.world_size,), elapsed, dtype=torch.float64, device="cuda").to(
|
||||
dtype=torch.float32
|
||||
)
|
||||
torch.cuda.current_stream().wait_stream(capture_stream)
|
||||
# TODO: use all_reduce may cause problem if the time elapsed between different algos are too close.
|
||||
# May change to broadcast in the future if that becomes an issue.
|
||||
self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM)
|
||||
avg_time = time_tensor[self.rank].item() / self.world_size
|
||||
|
||||
if avg_time < best_time:
|
||||
best_time = avg_time
|
||||
best_config = (algo, nb, nt)
|
||||
|
||||
if best_config:
|
||||
self.best_configs[size] = best_config
|
||||
if self.rank == 0:
|
||||
print(
|
||||
f"Size {size}: Best Algo {best_config[0].name} nblocks {best_config[1]} nthreads {best_config[2]} Time {(best_time/(n_graph_launches * n_ops_per_graph))*1000:.2f} us"
|
||||
)
|
||||
# reset the algorithms after tuning
|
||||
torch.cuda.synchronize()
|
||||
for algo in algos:
|
||||
algo.reset()
|
||||
|
||||
def _run_algo(self, algo: mscclpp.Algorithm, tensor, size, nblocks, nthreads):
|
||||
return algo.execute(
|
||||
comm=self.comm.communicator,
|
||||
input_buffer=tensor.data_ptr(),
|
||||
output_buffer=tensor.data_ptr(),
|
||||
input_size=size,
|
||||
output_size=size,
|
||||
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
|
||||
op=mscclpp.ReduceOp.SUM,
|
||||
stream=torch.cuda.current_stream().cuda_stream,
|
||||
nblocks=nblocks,
|
||||
nthreads_per_block=nthreads,
|
||||
symmetric_memory=True,
|
||||
)
|
||||
|
||||
def get_tuned_config(self, size):
|
||||
if size < 1024:
|
||||
target_size = 1024
|
||||
elif size > 256 * 1024 * 1024:
|
||||
target_size = 256 * 1024 * 1024
|
||||
else:
|
||||
target_size = 1 << (size - 1).bit_length()
|
||||
return self.best_configs.get(target_size)
|
||||
|
||||
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
|
||||
assert op == torch.distributed.ReduceOp.SUM
|
||||
config = self.get_tuned_config(tensor.nbytes)
|
||||
algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0)
|
||||
def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True):
|
||||
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
|
||||
ret = algo.execute(
|
||||
comm=self.comm.communicator,
|
||||
input_buffer=tensor.data_ptr(),
|
||||
@@ -195,107 +119,357 @@ class CustomizedComm:
|
||||
input_size=tensor.nbytes,
|
||||
output_size=tensor.nbytes,
|
||||
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype),
|
||||
op=to_mscclpp_reduce_op(op),
|
||||
stream=stream.cuda_stream if stream is not None else torch.cuda.current_stream().cuda_stream,
|
||||
nblocks=nblocks,
|
||||
nthreads_per_block=nthreads,
|
||||
symmetric_memory=True,
|
||||
op=op,
|
||||
stream=s,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
symmetric_memory=sym,
|
||||
accum_dtype=accum_dtype,
|
||||
)
|
||||
if ret != 0:
|
||||
print(f"Rank {self.rank}: Algo {algo.name} failed with error {ret}")
|
||||
print(f"Rank {self.rank}: {algo.name} failed ({ret})")
|
||||
return ret
|
||||
|
||||
def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None):
|
||||
if sym is None:
|
||||
sym = self.symmetric_memory
|
||||
s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream
|
||||
ret = algo.execute(
|
||||
comm=self.comm.communicator,
|
||||
input_buffer=inp.data_ptr(),
|
||||
output_buffer=out.data_ptr(),
|
||||
input_size=inp.nbytes,
|
||||
output_size=out.nbytes,
|
||||
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype),
|
||||
op=mscclpp.ReduceOp.NOP,
|
||||
stream=s,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
symmetric_memory=sym,
|
||||
)
|
||||
if ret != 0:
|
||||
print(f"Rank {self.rank}: AG {algo.name} failed ({ret})")
|
||||
return ret
|
||||
|
||||
def _barrier_internal(self):
|
||||
a, nb, nt = self._default_ar_config()
|
||||
self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True)
|
||||
|
||||
# -- lazy tuning --
|
||||
|
||||
def _ensure_tune_bufs(self):
|
||||
if self._tune_buf is None:
|
||||
self._tune_buf = _make_tensor(1 << 27, torch.float16)
|
||||
self._tune_buf.normal_()
|
||||
self._time_buf = _make_tensor(4096, torch.float32)
|
||||
return self._tune_buf
|
||||
|
||||
def _ar_candidates(self, size: int):
|
||||
out = []
|
||||
if size <= 4 << 20:
|
||||
a = self._algo("allreduce", "default_allreduce_nvls_packet")
|
||||
if self._nvls and a:
|
||||
out.append(a)
|
||||
a = self._algo("allreduce", "default_allreduce_packet")
|
||||
if a:
|
||||
out.append(a)
|
||||
a = self._algo("allreduce", "default_allreduce_allpair_packet")
|
||||
if a:
|
||||
out.append(a)
|
||||
if size >= 512 << 10:
|
||||
a = self._algo("allreduce", "default_allreduce_nvls_zero_copy")
|
||||
if self._nvls and self.symmetric_memory and a:
|
||||
out.append(a)
|
||||
a = self._algo("allreduce", "default_allreduce_rsag_zero_copy")
|
||||
if a:
|
||||
out.append(a)
|
||||
if torch.version.hip is not None:
|
||||
a = self._algo("allreduce", "default_allreduce_fullmesh")
|
||||
if a:
|
||||
out.append(a)
|
||||
return out
|
||||
|
||||
def _ag_candidates(self):
|
||||
a = self._algo("allgather", "default_allgather_fullmesh2")
|
||||
return [a] if a else []
|
||||
|
||||
def _run_tune(self, collective, algo, buf, size, nb, nt):
|
||||
"""Single tune invocation for either collective."""
|
||||
if collective == "allreduce":
|
||||
return algo.execute(
|
||||
comm=self.comm.communicator,
|
||||
input_buffer=buf.data_ptr(),
|
||||
output_buffer=buf.data_ptr(),
|
||||
input_size=size,
|
||||
output_size=size,
|
||||
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
|
||||
op=mscclpp.ReduceOp.SUM,
|
||||
stream=torch.cuda.current_stream().cuda_stream,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
symmetric_memory=True,
|
||||
)
|
||||
else:
|
||||
total = size * self.world_size
|
||||
out_ptr = buf.data_ptr()
|
||||
return algo.execute(
|
||||
comm=self.comm.communicator,
|
||||
input_buffer=out_ptr + self.rank * size,
|
||||
output_buffer=out_ptr,
|
||||
input_size=size,
|
||||
output_size=total,
|
||||
dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype),
|
||||
op=mscclpp.ReduceOp.NOP,
|
||||
stream=torch.cuda.current_stream().cuda_stream,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
symmetric_memory=False,
|
||||
)
|
||||
|
||||
def _tune_size(self, collective: str, target_size: int):
|
||||
"""Auto-tune one (collective, target_size) pair and cache result."""
|
||||
buf = self._ensure_tune_bufs()
|
||||
cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates()
|
||||
|
||||
best_time, best_cfg = float("inf"), None
|
||||
used = set()
|
||||
run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt)
|
||||
|
||||
for algo in cands:
|
||||
nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128)
|
||||
for nb in self._CANDIDATE_NBLOCKS:
|
||||
if nb > nb_limit:
|
||||
continue
|
||||
for nt in self._CANDIDATE_NTHREADS:
|
||||
# Feasibility — sync result across ranks so all agree
|
||||
ret = run(algo, nb, nt)
|
||||
torch.cuda.synchronize()
|
||||
self._time_buf[0] = float(ret)
|
||||
self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True)
|
||||
if self._time_buf[0].item() != 0:
|
||||
continue
|
||||
used.add(algo)
|
||||
|
||||
# Warmup
|
||||
for _ in range(self._TUNE_N_WARMUP):
|
||||
run(algo, nb, nt)
|
||||
|
||||
# CUDA-graph timed benchmark
|
||||
cs = torch.cuda.Stream()
|
||||
cs.wait_stream(torch.cuda.current_stream())
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g, stream=cs):
|
||||
for _ in range(self._TUNE_N_OPS_PER_GRAPH):
|
||||
run(algo, nb, nt)
|
||||
|
||||
start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
|
||||
start.record(cs)
|
||||
with torch.cuda.stream(cs):
|
||||
for _ in range(self._TUNE_N_GRAPH_LAUNCHES):
|
||||
g.replay()
|
||||
end.record(cs)
|
||||
end.synchronize()
|
||||
elapsed = start.elapsed_time(end)
|
||||
|
||||
# Cross-rank timing sync
|
||||
self._time_buf.fill_(elapsed)
|
||||
torch.cuda.current_stream().wait_stream(cs)
|
||||
self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True)
|
||||
avg = self._time_buf[self.rank].item() / self.world_size
|
||||
|
||||
if avg < best_time:
|
||||
best_time, best_cfg = avg, (algo, nb, nt)
|
||||
|
||||
if best_cfg:
|
||||
self._tune_cache[collective][target_size] = best_cfg
|
||||
if self.rank == 0:
|
||||
n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH
|
||||
print(
|
||||
f"[tune] {collective} size={target_size}: {best_cfg[0].name} "
|
||||
f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us",
|
||||
flush=True,
|
||||
)
|
||||
else:
|
||||
fb = (
|
||||
self._default_ar_config()
|
||||
if collective == "allreduce"
|
||||
else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None)
|
||||
)
|
||||
self._tune_cache[collective][target_size] = fb
|
||||
|
||||
torch.cuda.synchronize()
|
||||
self._barrier_internal()
|
||||
for a in used:
|
||||
a.reset()
|
||||
|
||||
# -- public API --
|
||||
|
||||
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None):
|
||||
sz = _round_pow2(tensor.nbytes)
|
||||
if sz not in self._tune_cache["allreduce"]:
|
||||
self._tune_size("allreduce", sz)
|
||||
a, nb, nt = self._tune_cache["allreduce"][sz]
|
||||
self._exec_ar(
|
||||
tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory
|
||||
)
|
||||
|
||||
def all_gather(self, output_tensor, input_tensor, stream=None):
|
||||
sz = _round_pow2(input_tensor.nbytes)
|
||||
if sz not in self._tune_cache["allgather"]:
|
||||
self._tune_size("allgather", sz)
|
||||
a, nb, nt = self._tune_cache["allgather"][sz]
|
||||
self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory)
|
||||
|
||||
def barrier(self):
|
||||
tensor = torch.empty(self.world_size, dtype=torch.float, device=torch.device("cuda"))
|
||||
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
|
||||
|
||||
def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100):
|
||||
low = 5 * 1024
|
||||
high = 80 * 1024 * 1024
|
||||
sizes = []
|
||||
curr = low
|
||||
while curr <= high:
|
||||
sizes.append(curr)
|
||||
curr *= 2
|
||||
|
||||
if self.rank == 0:
|
||||
print(f"{'Size (Bytes)':<20} {'Time (us)':<20} {'AlgoBW (GB/s)':<20}")
|
||||
|
||||
dtype = torch.float16
|
||||
capture_stream = torch.cuda.Stream()
|
||||
|
||||
# Allocate a single large RawGpuBuffer (symmetric memory) and reuse it for all sizes.
|
||||
# Cannot allocate per-size tensors with symmetric memory.
|
||||
bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype))
|
||||
bench_buf = torch.utils.dlpack.from_dlpack(bench_buf)
|
||||
bench_buf.normal_()
|
||||
|
||||
for size in sizes:
|
||||
n_elements = size // bench_buf.element_size()
|
||||
tensor = bench_buf[:n_elements]
|
||||
|
||||
capture_stream.wait_stream(torch.cuda.current_stream())
|
||||
# Capture Graph
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g, stream=capture_stream):
|
||||
for _ in range(n_iter_per_graph):
|
||||
self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
|
||||
|
||||
# warmup: Execute the graph once to prime the driver
|
||||
with torch.cuda.stream(capture_stream):
|
||||
for _ in range(n_warmup):
|
||||
g.replay()
|
||||
self.barrier()
|
||||
capture_stream.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start_event.record(capture_stream)
|
||||
with torch.cuda.stream(capture_stream):
|
||||
for _ in range(n_graph_launches):
|
||||
g.replay()
|
||||
end_event.record(capture_stream)
|
||||
end_event.synchronize()
|
||||
|
||||
# Get elapsed time in milliseconds
|
||||
elapsed_ms = start_event.elapsed_time(end_event)
|
||||
avg_time_ms = elapsed_ms / (n_graph_launches * n_iter_per_graph)
|
||||
time_us = avg_time_ms * 1000
|
||||
|
||||
alg_bw = size / (avg_time_ms * 1e-3) if avg_time_ms > 0 else 0
|
||||
if self.rank == 0:
|
||||
print(f"{size:<20} {time_us:<20.2f} {alg_bw / 1e9:<20.2f}")
|
||||
self._barrier_internal()
|
||||
|
||||
def destroy(self):
|
||||
self._algorithm_nvls_nonzero_copy = None
|
||||
self._algorithm_nvls_packet = None
|
||||
self.scratch_buffer = None
|
||||
self.comm = None
|
||||
self._algos.clear()
|
||||
self._tune_cache = {"allreduce": {}, "allgather": {}}
|
||||
self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None
|
||||
|
||||
|
||||
def init_dist() -> CustomizedComm:
|
||||
rank = int(os.environ["RANK"])
|
||||
world = int(os.environ["WORLD_SIZE"])
|
||||
master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
|
||||
master_port = os.environ["MSCCLPP_MASTER_PORT"]
|
||||
interface = interfaces_for_ip_netifaces(master_addr)
|
||||
if interface is None:
|
||||
raise ValueError(f"Cannot find network interface for IP address {master_addr}")
|
||||
interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
|
||||
mscclpp_group = mscclpp.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
|
||||
return CustomizedComm(mscclpp_group)
|
||||
# -- Benchmarks (standalone) --------------------------------------------------
|
||||
|
||||
|
||||
def _bench_sizes(low=5 * 1024, high=80 << 20):
|
||||
sizes, c = [], low
|
||||
while c <= high:
|
||||
sizes.append(c)
|
||||
c *= 2
|
||||
return sizes
|
||||
|
||||
|
||||
def benchmark_allreduce(
|
||||
comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100
|
||||
):
|
||||
sizes = _bench_sizes()
|
||||
if comm.rank == 0:
|
||||
print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}")
|
||||
print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
|
||||
|
||||
cs = torch.cuda.Stream()
|
||||
buf = _make_tensor(1 << 27, dtype)
|
||||
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
|
||||
|
||||
for size in sizes:
|
||||
nelems = size // buf.element_size()
|
||||
t = buf[: size // buf.element_size()]
|
||||
comm.all_reduce(t, accum_dtype=accum_dtype)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
cs.wait_stream(torch.cuda.current_stream())
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g, stream=cs):
|
||||
for _ in range(n_iter):
|
||||
comm.all_reduce(t, accum_dtype=accum_dtype)
|
||||
with torch.cuda.stream(cs):
|
||||
for _ in range(n_warmup):
|
||||
g.replay()
|
||||
comm.barrier()
|
||||
cs.synchronize()
|
||||
|
||||
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
|
||||
s.record(cs)
|
||||
with torch.cuda.stream(cs):
|
||||
for _ in range(n_graph_launches):
|
||||
g.replay()
|
||||
e.record(cs)
|
||||
e.synchronize()
|
||||
|
||||
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
|
||||
if comm.rank == 0:
|
||||
print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}")
|
||||
|
||||
|
||||
def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100):
|
||||
sizes = _bench_sizes()
|
||||
if comm.rank == 0:
|
||||
print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}")
|
||||
print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}")
|
||||
|
||||
cs = torch.cuda.Stream()
|
||||
buf = _make_tensor(1 << 27, dtype)
|
||||
buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0)
|
||||
|
||||
for prs in sizes:
|
||||
total = prs * comm.world_size
|
||||
if total > buf.nbytes:
|
||||
break
|
||||
nt = total // buf.element_size()
|
||||
npr = prs // buf.element_size()
|
||||
out = buf[:nt]
|
||||
inp = out[comm.rank * npr : (comm.rank + 1) * npr]
|
||||
|
||||
comm.all_gather(out, inp)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
cs.wait_stream(torch.cuda.current_stream())
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g, stream=cs):
|
||||
for _ in range(n_iter):
|
||||
comm.all_gather(out, inp)
|
||||
with torch.cuda.stream(cs):
|
||||
for _ in range(n_warmup):
|
||||
g.replay()
|
||||
comm.barrier()
|
||||
cs.synchronize()
|
||||
|
||||
s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
|
||||
s.record(cs)
|
||||
with torch.cuda.stream(cs):
|
||||
for _ in range(n_graph_launches):
|
||||
g.replay()
|
||||
e.record(cs)
|
||||
e.synchronize()
|
||||
|
||||
ms = s.elapsed_time(e) / (n_graph_launches * n_iter)
|
||||
if comm.rank == 0:
|
||||
print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}")
|
||||
|
||||
|
||||
# -- Bootstrap & main ---------------------------------------------------------
|
||||
|
||||
|
||||
def init_dist() -> mscclpp.CommGroup:
|
||||
addr = os.environ.get("MSCCLPP_MASTER_ADDR")
|
||||
if addr:
|
||||
rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"])
|
||||
port = os.environ["MSCCLPP_MASTER_PORT"]
|
||||
iface = _interfaces_for_ip(addr)
|
||||
if not iface:
|
||||
raise ValueError(f"No interface for {addr}")
|
||||
return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world)
|
||||
import torch.distributed as dist
|
||||
|
||||
dist.init_process_group(backend="gloo")
|
||||
return mscclpp.CommGroup(torch_group=dist.group.WORLD)
|
||||
|
||||
|
||||
def main():
|
||||
local = int(os.environ["LOCAL_RANK"])
|
||||
torch.cuda.set_device(local)
|
||||
comm = init_dist()
|
||||
comm.benchmark(n_warmup=5, n_graph_launches=10, n_iter_per_graph=100)
|
||||
comm.barrier()
|
||||
|
||||
dtype_str = os.environ.get("DTYPE", "float16")
|
||||
dtype = getattr(torch, dtype_str, torch.float16)
|
||||
accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16}
|
||||
accum_str = os.environ.get("ACCUM_DTYPE")
|
||||
accum_dtype = accum_map.get(accum_str) if accum_str else None
|
||||
|
||||
comm_group = init_dist()
|
||||
cc = CustomizedComm(comm_group)
|
||||
|
||||
print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...")
|
||||
benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype)
|
||||
cc.barrier()
|
||||
torch.cuda.synchronize()
|
||||
comm.destroy()
|
||||
print(f"rank {local} All-reduce operation completed successfully.")
|
||||
|
||||
benchmark_allgather(cc, dtype=dtype)
|
||||
cc.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
cc.destroy()
|
||||
print(f"rank {local} completed successfully.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,19 +1,20 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
|
||||
# LD_PRELOAD=<MSCCLPP_REPO>/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
|
||||
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
import torch, torch.distributed as dist
|
||||
import mscclpp
|
||||
import mscclpp.ext
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType
|
||||
from mscclpp.language.program import CollectiveProgram
|
||||
from mscclpp.language.rank import Rank
|
||||
from mscclpp.language.utils import AlgoSpec
|
||||
|
||||
|
||||
def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
|
||||
def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram:
|
||||
gpu_size = spec.world_size
|
||||
with CollectiveProgram.from_spec(spec) as program:
|
||||
# Creating Channels
|
||||
@@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
|
||||
return program
|
||||
|
||||
|
||||
def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int):
|
||||
spec = mscclpp.AlgoSpec(
|
||||
def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int):
|
||||
spec = AlgoSpec(
|
||||
name="allreduce_nvls",
|
||||
collective=AllReduce(8, 1, True),
|
||||
nranks_per_node=8,
|
||||
@@ -94,10 +95,10 @@ def init_dist():
|
||||
rank = int(os.environ["RANK"])
|
||||
world = int(os.environ["WORLD_SIZE"])
|
||||
local = int(os.environ["LOCAL_RANK"])
|
||||
algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder()
|
||||
algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder()
|
||||
setup_plan(algorithm_collection_builder, rank, world)
|
||||
algorithm_collection_builder.set_algorithm_selector(selector)
|
||||
dist.init_process_group(backend="nccl", device_id=local)
|
||||
dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local))
|
||||
return rank, world, local
|
||||
|
||||
|
||||
|
||||
@@ -103,12 +103,14 @@ class Algorithm {
|
||||
/// @param nThreadsPerBlock Number of threads per block (0 for auto-selection).
|
||||
/// @param symmetricMemory Whether to use symmetric memory optimization.
|
||||
/// @param extras Additional parameters for algorithm-specific customization.
|
||||
/// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype.
|
||||
/// @return The result of the operation.
|
||||
virtual CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
|
||||
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
|
||||
bool symmetricMemory = false,
|
||||
const std::unordered_map<std::string, uintptr_t>& extras = {}) = 0;
|
||||
const std::unordered_map<std::string, uintptr_t>& extras = {},
|
||||
DataType accumDtype = DataType::AUTO) = 0;
|
||||
|
||||
/// Reset the algorithm state, clearing any cached contexts.
|
||||
virtual void reset() = 0;
|
||||
@@ -186,10 +188,11 @@ class NativeAlgorithm : public Algorithm {
|
||||
/// @param nBlocks Number of CUDA blocks.
|
||||
/// @param nThreadsPerBlock Number of threads per block.
|
||||
/// @param extras Additional algorithm-specific parameters.
|
||||
/// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel).
|
||||
/// @return The result of the operation.
|
||||
using KernelFunc =
|
||||
std::function<CommResult(const std::shared_ptr<void>, const void*, void*, size_t, size_t, DataType, ReduceOp,
|
||||
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&)>;
|
||||
cudaStream_t, int, int, const std::unordered_map<std::string, uintptr_t>&, DataType)>;
|
||||
|
||||
/// Function type for creating algorithm contexts.
|
||||
/// @param comm The communicator.
|
||||
@@ -233,8 +236,8 @@ class NativeAlgorithm : public Algorithm {
|
||||
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
|
||||
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
|
||||
bool symmetricMemory = false,
|
||||
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
|
||||
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
|
||||
DataType accumDtype = DataType::AUTO) override;
|
||||
const std::string& name() const override;
|
||||
const std::string& collective() const override;
|
||||
const std::pair<size_t, size_t>& messageRange() const override;
|
||||
@@ -285,8 +288,8 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab
|
||||
CommResult execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream,
|
||||
std::shared_ptr<Executor> executor, int nBlocks = 0, int nThreadsPerBlock = 0,
|
||||
bool symmetricMemory = false,
|
||||
const std::unordered_map<std::string, uintptr_t>& extras = {}) override;
|
||||
bool symmetricMemory = false, const std::unordered_map<std::string, uintptr_t>& extras = {},
|
||||
DataType accumDtype = DataType::AUTO) override;
|
||||
AlgorithmType type() const override { return AlgorithmType::DSL; }
|
||||
Constraint constraint() const override;
|
||||
void reset() override;
|
||||
|
||||
@@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o
|
||||
return cuda::atomic_ref<T, Scope>{*ptr}.fetch_add(val, memoryOrder);
|
||||
}
|
||||
|
||||
#elif defined(MSCCLPP_DEVICE_HIP)
|
||||
#else // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
|
||||
constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
|
||||
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
|
||||
@@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
|
||||
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
|
||||
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;
|
||||
|
||||
// HIP does not have thread scope enums like CUDA
|
||||
constexpr auto scopeSystem = 0;
|
||||
constexpr auto scopeDevice = 0;
|
||||
|
||||
@@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde
|
||||
return __atomic_fetch_add(ptr, val, memoryOrder);
|
||||
}
|
||||
|
||||
#endif // defined(MSCCLPP_DEVICE_HIP)
|
||||
#endif // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
|
||||
@@ -390,7 +390,7 @@ struct EndpointConfig {
|
||||
};
|
||||
|
||||
static constexpr int DefaultPort = -1;
|
||||
static constexpr int DefaultGidIndex = 0;
|
||||
static constexpr int DefaultGidIndex = -1;
|
||||
static constexpr int DefaultMaxCqSize = 1024;
|
||||
static constexpr int DefaultMaxCqPollNum = 1;
|
||||
static constexpr int DefaultMaxSendWr = 8192;
|
||||
@@ -419,7 +419,7 @@ struct EndpointConfig {
|
||||
/// Constructor.
|
||||
/// @param deviceIndex Device index.
|
||||
/// @param port Port number.
|
||||
/// @param gidIndex GID index.
|
||||
/// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable.
|
||||
/// @param maxCqSize Maximum send completion queue size.
|
||||
/// @param maxCqPollNum Maximum send completion queue poll count.
|
||||
/// @param maxSendWr Maximum outstanding send work requests.
|
||||
|
||||
@@ -110,6 +110,11 @@ class Env {
|
||||
/// Default is false.
|
||||
const bool forceDisableNvls;
|
||||
|
||||
/// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
|
||||
/// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
|
||||
/// Default is false.
|
||||
const bool forceDisableGdr;
|
||||
|
||||
/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
|
||||
/// When set to a non-negative value, overrides the `gidIndex` parameter passed to `EndpointConfig::Ib`.
|
||||
/// Default is -1 (unset); in that case the `gidIndex` argument passed to the `EndpointConfig::Ib` constructor is used as-is.
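// A minimal sketch of how the two -1 sentinels are expected to compose (illustrative only; the
// helper below is not the actual implementation, and `mscclpp::env()->ibGidIndex` is the field
// documented above):
//   int resolveGidIndex(int ctorGidIndex) {           // EndpointConfig::Ib gidIndex, default -1
//     int fromEnv = mscclpp::env()->ibGidIndex;       // -1 when MSCCLPP_IB_GID_INDEX is unset
//     return fromEnv >= 0 ? fromEnv : ctorGidIndex;   // env variable wins when set
//   }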
|
||||
|
||||
@@ -64,18 +64,151 @@ using __bfloat162 = __nv_bfloat162;
|
||||
|
||||
#endif
|
||||
|
||||
/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15.
|
||||
/// Format (MSB first): [sign:1][exponent:4][mantissa:3]
|
||||
/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention).
|
||||
/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6.
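// Worked bit patterns for the layout above (illustrative, not part of the original header):
//   0x77 -> sign=0 exp=14 mant=7 -> 1.875 * 2^(14-15) = 0.9375       (max finite)
//   0x08 -> sign=0 exp=1  mant=0 -> 1.0   * 2^(1-15)  = 2^-14        (min normal,    ~6.1e-5)
//   0x01 -> sign=0 exp=0  mant=1 -> 0.125 * 2^(1-15)  = 2^-17        (min subnormal, ~7.6e-6)
//   0x80 -> negative-zero pattern, treated as NaN (fnuz); any pattern with exp=15 is also NaN.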
|
||||
struct alignas(1) __fp8_e4m3b15 {
|
||||
uint8_t __x;
|
||||
|
||||
__fp8_e4m3b15() = default;
|
||||
|
||||
/// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity).
|
||||
MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {}
|
||||
|
||||
/// Construct from float32 (explicit to avoid ambiguous conversion chains).
|
||||
MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {}
|
||||
|
||||
/// Convert to float32.
|
||||
MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); }
|
||||
|
||||
/// Construct from a raw bit pattern without conversion.
|
||||
static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) {
|
||||
__fp8_e4m3b15 r;
|
||||
r.__x = bits;
|
||||
return r;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Decode fp8_e4m3b15 bits → float32.
|
||||
///
|
||||
/// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler.
|
||||
/// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7.
|
||||
/// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8,
|
||||
/// then convert fp16 → float32.
|
||||
static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) {
|
||||
// Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN.
|
||||
uint32_t exp = (bits >> 3) & 0xFu;
|
||||
if (bits == 0x80 || exp == 15) {
|
||||
union {
|
||||
uint32_t u;
|
||||
float f;
|
||||
} nan_val = {0x7FC00000u};
|
||||
return nan_val.f;
|
||||
}
|
||||
if (bits == 0) return 0.0f;
|
||||
|
||||
// Triton-style bit manipulation: fp8 → fp16 → fp32.
|
||||
// fp8 layout: [S:1][E:4][M:3] (bias=15)
|
||||
// fp16 layout: [S:1][E:5][M:10] (bias=15)
|
||||
//
|
||||
// Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1
|
||||
// to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15.
|
||||
// Refer:
|
||||
// https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34
|
||||
uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16
|
||||
uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position
|
||||
uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign)
|
||||
uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1
|
||||
|
||||
// For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0
|
||||
// and fp16 mantissa = (fp8_mantissa << 7), which correctly represents
|
||||
// the subnormal fp16 value since both share bias=15.
|
||||
|
||||
// Convert fp16 bits to float via __half (works on host and device, CUDA and HIP).
|
||||
union {
|
||||
uint16_t u;
|
||||
__half h;
|
||||
} cvt = {fp16_bits};
|
||||
return __half2float(cvt.h);
|
||||
}
|
||||
|
||||
/// Encode float32 → fp8_e4m3b15 bits.
|
||||
///
|
||||
/// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8.
|
||||
/// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15),
|
||||
/// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1.
|
||||
static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t u;
|
||||
} in = {val};
|
||||
|
||||
// NaN → 0x80 (negative-zero bit pattern = NaN in fnuz).
|
||||
if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u;
|
||||
|
||||
// Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP).
|
||||
__half h_val = __float2half_rn(val);
|
||||
union {
|
||||
__half h;
|
||||
uint16_t u;
|
||||
} cvt = {h_val};
|
||||
uint16_t fp16_bits = cvt.u;
|
||||
|
||||
// Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80.
|
||||
uint16_t abs_fp16 = fp16_bits & 0x7FFFu;
|
||||
if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u;
|
||||
|
||||
// Reconstruct with sign.
|
||||
uint16_t sign16 = fp16_bits & 0x8000u;
|
||||
|
||||
// Triton-style: fp16 → fp8.
|
||||
// fp16 layout: [S:1][E:5][M:10] (bias=15)
|
||||
// fp8 layout: [S:1][E:4][M:3] (bias=15)
|
||||
//
|
||||
// mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080)
|
||||
// This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias.
|
||||
// Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0
|
||||
// Finally: prmt for byte extraction.
|
||||
//
|
||||
// Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte.
|
||||
uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u);
|
||||
// The upper byte now contains [E:4][M:3][round_bit].
|
||||
// Combine with sign and extract.
|
||||
uint16_t with_sign = sign16 | adjusted;
|
||||
uint8_t result = (uint8_t)(with_sign >> 8);
|
||||
|
||||
// Zero → 0x00 (ensure positive zero, not negative zero which is NaN).
|
||||
if ((result & 0x7Fu) == 0) result = 0x00u;
|
||||
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Packed 2x fp8_e4m3b15 storage.
|
||||
struct alignas(2) __fp8x2_e4m3b15 {
|
||||
uint16_t __x;
|
||||
};
|
||||
|
||||
/// Packed 4x fp8_e4m3b15 storage.
|
||||
struct alignas(4) __fp8x4_e4m3b15 {
|
||||
uint32_t __x;
|
||||
};
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
/// Data types supported by mscclpp operations.
|
||||
enum class DataType {
|
||||
INT32, // 32-bit signed integer.
|
||||
UINT32, // 32-bit unsigned integer.
|
||||
FLOAT16, // IEEE 754 half precision.
|
||||
FLOAT32, // IEEE 754 single precision.
|
||||
BFLOAT16, // bfloat16 precision.
|
||||
FLOAT8_E4M3, // float8 with E4M3 layout.
|
||||
FLOAT8_E5M2, // float8 with E5M2 layout.
|
||||
UINT8, // 8-bit unsigned integer.
|
||||
INT32, // 32-bit signed integer.
|
||||
UINT32, // 32-bit unsigned integer.
|
||||
FLOAT16, // IEEE 754 half precision.
|
||||
FLOAT32, // IEEE 754 single precision.
|
||||
BFLOAT16, // bfloat16 precision.
|
||||
FLOAT8_E4M3, // float8 with E4M3 layout.
|
||||
FLOAT8_E5M2, // float8 with E5M2 layout.
|
||||
UINT8, // 8-bit unsigned integer.
|
||||
FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel).
|
||||
AUTO = 255, // Sentinel: resolve to the input dtype at runtime.
|
||||
};
|
||||
|
||||
/// Word array.
|
||||
@@ -97,6 +230,7 @@ struct alignas(Bytes) Words<Bytes, false> {};
|
||||
template <typename T, int N, typename StorageT>
|
||||
union alignas(sizeof(T) * N) VectorTypeImpl {
|
||||
static_assert(N > 0, "N must be greater than 0");
|
||||
static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size");
|
||||
|
||||
T data[N];
|
||||
Words<sizeof(T) * N> words;
|
||||
@@ -127,13 +261,14 @@ union alignas(sizeof(T) * N) VectorTypeImpl {
|
||||
MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; }
|
||||
};
|
||||
|
||||
// Helper template to get the appropriate vector type for a given element type and count
|
||||
// Helper template to get the appropriate vector type for a given element type and count.
|
||||
template <typename T, int N>
|
||||
struct VectorTypeHelper {
|
||||
using type =
|
||||
VectorTypeImpl<T, N,
|
||||
typename std::conditional_t<N * sizeof(T) == 4, uint32_t,
|
||||
typename std::conditional_t<N * sizeof(T) == 8, uint2, uint4>>>;
|
||||
static constexpr int Bytes = N * sizeof(T);
|
||||
using type = VectorTypeImpl<
|
||||
T, N,
|
||||
std::conditional_t<Bytes == 4, uint32_t,
|
||||
std::conditional_t<Bytes == 8, uint2, std::conditional_t<Bytes <= 16, uint4, Words<Bytes>>>>>;
|
||||
};
|
||||
|
||||
/// Vector type - clean user interface (automatically selects appropriate storage type)
|
||||
@@ -170,6 +305,11 @@ DEFINE_VEC(bf16x4, __bfloat16, 4, uint2);
|
||||
DEFINE_VEC(f16x8, __half, 8, uint4);
|
||||
DEFINE_VEC(bf16x8, __bfloat16, 8, uint4);
|
||||
|
||||
// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists.
|
||||
using f32x8 = VectorType<float, 8>;
|
||||
using f32x16 = VectorType<float, 16>;
|
||||
using f16x16 = VectorType<__half, 16>;
|
||||
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3);
|
||||
DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3);
|
||||
@@ -181,6 +321,12 @@ DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2);
|
||||
DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2);
|
||||
DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4);
|
||||
#endif
|
||||
|
||||
// fp8_e4m3b15 vectors (always available — software type, no HW dependency)
|
||||
DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15);
|
||||
DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15);
|
||||
DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2);
|
||||
DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4);
|
||||
#undef DEFINE_VEC
|
||||
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
@@ -254,6 +400,21 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// --- f32x2 arithmetic ---
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000)
|
||||
// Blackwell (SM 10.0+): packed float2 add in a single instruction.
|
||||
return __fadd2_rn(a.storage, b.storage);
|
||||
#else
|
||||
f32x2 result;
|
||||
result.data[0] = a.data[0] + b.data[0];
|
||||
result.data[1] = a.data[1] + b.data[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
|
||||
__half2 result;
|
||||
@@ -265,6 +426,18 @@ MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) {
|
||||
return result;
|
||||
}
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) {
|
||||
// Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd).
|
||||
const f16x2* a2 = reinterpret_cast<const f16x2*>(&a);
|
||||
const f16x2* b2 = reinterpret_cast<const f16x2*>(&b);
|
||||
f16x4 result;
|
||||
f16x2* r2 = reinterpret_cast<f16x2*>(&result);
|
||||
r2[0] = a2[0] + b2[0];
|
||||
r2[1] = a2[1] + b2[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) {
|
||||
__bfloat162 result;
|
||||
@@ -449,6 +622,14 @@ MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) {
|
||||
return (a < b ? a : b);
|
||||
}
|
||||
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) {
|
||||
f32x2 result;
|
||||
result.data[0] = fminf(a.data[0], b.data[0]);
|
||||
result.data[1] = fminf(a.data[1], b.data[1]);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP)
|
||||
@@ -489,6 +670,51 @@ MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Convert a vector type From to vector type To.
|
||||
/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks,
|
||||
/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached.
|
||||
/// Specialized below for optimized FP8 conversion paths at x2/x4 level.
|
||||
template <typename To, typename From>
|
||||
MSCCLPP_DEVICE_INLINE To to(const From& v) {
|
||||
static_assert(To::Size == From::Size, "to<To, From>: vector sizes must match");
|
||||
constexpr int N = From::Size;
|
||||
|
||||
// Auto-decompose: N > 4 → split into x4 chunks
|
||||
if constexpr (N > 4 && N % 4 == 0) {
|
||||
constexpr int nChunks = N / 4;
|
||||
using FromChunk = VectorType<typename From::ElementType, 4>;
|
||||
using ToChunk = VectorType<typename To::ElementType, 4>;
|
||||
const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
|
||||
To result;
|
||||
ToChunk* out = reinterpret_cast<ToChunk*>(&result);
|
||||
#pragma unroll
|
||||
for (int c = 0; c < nChunks; ++c) {
|
||||
out[c] = to<ToChunk>(in[c]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
// Auto-decompose: N == 4 → split into 2x x2 chunks
|
||||
else if constexpr (N == 4) {
|
||||
using FromChunk = VectorType<typename From::ElementType, 2>;
|
||||
using ToChunk = VectorType<typename To::ElementType, 2>;
|
||||
const FromChunk* in = reinterpret_cast<const FromChunk*>(&v);
|
||||
To result;
|
||||
ToChunk* out = reinterpret_cast<ToChunk*>(&result);
|
||||
out[0] = to<ToChunk>(in[0]);
|
||||
out[1] = to<ToChunk>(in[1]);
|
||||
return result;
|
||||
}
|
||||
// Base case: element-wise conversion
|
||||
else {
|
||||
To result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < N; ++i) {
|
||||
result.data[i] = static_cast<typename To::ElementType>(v.data[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
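// Usage sketch (illustrative; not from the original header): converting a 16-element vector in one
// call decomposes 16 -> 4 chunks of 4, and each x4 chunk reaches an optimized x4/x2 specialization
// below when one exists, otherwise the element-wise base case.
//   f8_e4m3b15x16 packed = /* loaded elsewhere */;
//   f16x16 widened = mscclpp::to<f16x16>(packed);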
|
||||
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) {
|
||||
@@ -551,7 +777,592 @@ MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) {
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// --- f8_e4m3 -> f32 specializations ---
|
||||
|
||||
/// f8_e4m3x2 -> f32x2.
|
||||
/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
|
||||
/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3x2>(const f8_e4m3x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
|
||||
f32x2 result;
|
||||
result.data[0] = f[0];
|
||||
result.data[1] = f[1];
|
||||
return result;
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
|
||||
f32x2 result;
|
||||
result.data[0] = __half2float(bit_cast<__half>(h2.x));
|
||||
result.data[1] = __half2float(bit_cast<__half>(h2.y));
|
||||
return result;
|
||||
#else
|
||||
f32x2 result;
|
||||
result.data[0] = float(v.data[0]);
|
||||
result.data[1] = float(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f8_e4m3x4 -> f32x4.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3x4>(const f8_e4m3x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false);
|
||||
auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true);
|
||||
f32x4 result;
|
||||
result.data[0] = lo[0];
|
||||
result.data[1] = lo[1];
|
||||
result.data[2] = hi[0];
|
||||
result.data[3] = hi[1];
|
||||
return result;
|
||||
#else
|
||||
const f8_e4m3x2* pair = reinterpret_cast<const f8_e4m3x2*>(&v);
|
||||
f32x2 lo = to<f32x2>(pair[0]);
|
||||
f32x2 hi = to<f32x2>(pair[1]);
|
||||
f32x4 result;
|
||||
result.data[0] = lo.data[0];
|
||||
result.data[1] = lo.data[1];
|
||||
result.data[2] = hi.data[0];
|
||||
result.data[3] = hi.data[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- f8_e5m2 -> f32 specializations ---
|
||||
|
||||
/// f8_e5m2x2 -> f32x2.
|
||||
/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float.
|
||||
/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e5m2x2>(const f8_e5m2x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0);
|
||||
f32x2 result;
|
||||
result.data[0] = f[0];
|
||||
result.data[1] = f[1];
|
||||
return result;
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2);
|
||||
f32x2 result;
|
||||
result.data[0] = __half2float(bit_cast<__half>(h2.x));
|
||||
result.data[1] = __half2float(bit_cast<__half>(h2.y));
|
||||
return result;
|
||||
#else
|
||||
f32x2 result;
|
||||
result.data[0] = float(v.data[0]);
|
||||
result.data[1] = float(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f8_e5m2x4 -> f32x4.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e5m2x4>(const f8_e5m2x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false);
|
||||
auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true);
|
||||
f32x4 result;
|
||||
result.data[0] = lo[0];
|
||||
result.data[1] = lo[1];
|
||||
result.data[2] = hi[0];
|
||||
result.data[3] = hi[1];
|
||||
return result;
|
||||
#else
|
||||
const f8_e5m2x2* pair = reinterpret_cast<const f8_e5m2x2*>(&v);
|
||||
f32x2 lo = to<f32x2>(pair[0]);
|
||||
f32x2 hi = to<f32x2>(pair[1]);
|
||||
f32x4 result;
|
||||
result.data[0] = lo.data[0];
|
||||
result.data[1] = lo.data[1];
|
||||
result.data[2] = hi.data[0];
|
||||
result.data[3] = hi.data[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- f32 -> f8_e4m3 specializations (downcast) ---
|
||||
|
||||
/// f32x2 -> f8_e4m3x2.
|
||||
/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32).
|
||||
/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2).
|
||||
/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f32x2>(const f32x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
|
||||
return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2;
|
||||
h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
|
||||
h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
|
||||
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
|
||||
return bit_cast<f8_e4m3x2>(fp8x2);
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA)
|
||||
__half_raw h0, h1;
|
||||
h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
|
||||
h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
|
||||
f8_e4m3x2 result;
|
||||
result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
|
||||
result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
|
||||
return result;
|
||||
#else
|
||||
f8_e4m3x2 result;
|
||||
result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
|
||||
result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f32x4 -> f8_e4m3x4.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3x4 to<f8_e4m3x4, f32x4>(const f32x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
|
||||
packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true);
|
||||
return bit_cast<f8_e4m3x4>(packed);
|
||||
#else
|
||||
f32x2 lo, hi;
|
||||
lo.data[0] = v.data[0];
|
||||
lo.data[1] = v.data[1];
|
||||
hi.data[0] = v.data[2];
|
||||
hi.data[1] = v.data[3];
|
||||
f8_e4m3x2 lo_fp8 = to<f8_e4m3x2>(lo);
|
||||
f8_e4m3x2 hi_fp8 = to<f8_e4m3x2>(hi);
|
||||
f8_e4m3x4 result;
|
||||
result.data[0] = lo_fp8.data[0];
|
||||
result.data[1] = lo_fp8.data[1];
|
||||
result.data[2] = hi_fp8.data[0];
|
||||
result.data[3] = hi_fp8.data[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- f32 -> f8_e5m2 specializations (downcast) ---
|
||||
|
||||
/// f32x2 -> f8_e5m2x2.
|
||||
/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32).
|
||||
/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2).
|
||||
/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e5m2x2 to<f8_e5m2x2, f32x2>(const f32x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
|
||||
return bit_cast<f8_e5m2x2>(static_cast<__hip_fp8x2_storage_t>(packed));
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2;
|
||||
h2.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
|
||||
h2.y = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
|
||||
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2);
|
||||
return bit_cast<f8_e5m2x2>(fp8x2);
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA)
|
||||
__half_raw h0, h1;
|
||||
h0.x = bit_cast<unsigned short>(__float2half_rn(v.data[0]));
|
||||
h1.x = bit_cast<unsigned short>(__float2half_rn(v.data[1]));
|
||||
f8_e5m2x2 result;
|
||||
result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2));
|
||||
result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2));
|
||||
return result;
|
||||
#else
|
||||
f8_e5m2x2 result;
|
||||
result.data[0] = static_cast<__fp8_e5m2>(v.data[0]);
|
||||
result.data[1] = static_cast<__fp8_e5m2>(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f32x4 -> f8_e5m2x4.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e5m2x4 to<f8_e5m2x4, f32x4>(const f32x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
|
||||
packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true);
|
||||
return bit_cast<f8_e5m2x4>(packed);
|
||||
#else
|
||||
f32x2 lo, hi;
|
||||
lo.data[0] = v.data[0];
|
||||
lo.data[1] = v.data[1];
|
||||
hi.data[0] = v.data[2];
|
||||
hi.data[1] = v.data[3];
|
||||
f8_e5m2x2 lo_fp8 = to<f8_e5m2x2>(lo);
|
||||
f8_e5m2x2 hi_fp8 = to<f8_e5m2x2>(hi);
|
||||
f8_e5m2x4 result;
|
||||
result.data[0] = lo_fp8.data[0];
|
||||
result.data[1] = lo_fp8.data[1];
|
||||
result.data[2] = hi_fp8.data[0];
|
||||
result.data[3] = hi_fp8.data[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- f8_e4m3 <-> f16 conversion specializations ---
|
||||
|
||||
/// f8_e4m3x2 -> f16x2.
|
||||
/// NVIDIA SM90+: packed intrinsic (1 instruction).
|
||||
/// HIP gfx942: fp8 -> float -> half (via AMD builtin).
|
||||
/// Pre-SM90 / fallback: element-wise scalar conversion.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3x2>(const f8_e4m3x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
|
||||
f16x2 result;
|
||||
result.data[0] = __float2half(f[0]);
|
||||
result.data[1] = __float2half(f[1]);
|
||||
return result;
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
|
||||
return bit_cast<f16x2>(h2);
|
||||
#else
|
||||
f16x2 result;
|
||||
result.data[0] = static_cast<__half>(v.data[0]);
|
||||
result.data[1] = static_cast<__half>(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f16x2 -> f8_e4m3x2.
|
||||
/// NVIDIA SM90+: packed intrinsic (1 instruction).
|
||||
/// HIP gfx942: half -> float -> fp8 (via AMD builtin).
|
||||
/// Pre-SM90: element-wise scalar conversion.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2, f16x2>(const f16x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
float f0 = __half2float(v.data[0]);
|
||||
float f1 = __half2float(v.data[1]);
|
||||
uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false);
|
||||
return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
|
||||
__half2_raw h2 = bit_cast<__half2_raw>(v);
|
||||
__nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
|
||||
return bit_cast<f8_e4m3x2>(fp8x2);
|
||||
#elif defined(MSCCLPP_DEVICE_CUDA)
|
||||
__half_raw h0, h1;
|
||||
h0.x = bit_cast<unsigned short>(v.data[0]);
|
||||
h1.x = bit_cast<unsigned short>(v.data[1]);
|
||||
f8_e4m3x2 result;
|
||||
result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
|
||||
result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
|
||||
return result;
|
||||
#else
|
||||
f8_e4m3x2 result;
|
||||
result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
|
||||
result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(__FP8_TYPES_EXIST__)
|
||||
|
||||
// --- fp8_e4m3b15 <-> fp16 direct conversion specializations ---
|
||||
// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift
|
||||
// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits
|
||||
// vs fp8's 3. fp32 conversions are derived by routing through fp16.
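// Worked example of the 1-bit shift (illustrative): e4m3b15 bits 0x70 encode 0.5; placing them in
// the high byte gives 0x7000, and masking off the sign then shifting exponent+mantissa right by one
// gives 0x3800, which is exactly 0.5 in fp16; no rounding is involved.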
|
||||
|
||||
/// f8_e4m3b15x2 -> f16x2.
|
||||
/// Direct fp8 -> fp16 via branch-free bit manipulation.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f16x2 to<f16x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint16_t in = v.storage.__x;
|
||||
// Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5.
|
||||
uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
|
||||
uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
|
||||
uint32_t out0 = b0 | (a0 & 0x80008000u);
|
||||
__half2 h;
|
||||
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h)) : "r"(out0));
|
||||
return h;
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
// gfx942: same bit manipulation as CUDA, store packed fp16 bits via words[].
|
||||
uint16_t in = v.storage.__x;
|
||||
uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
|
||||
uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
|
||||
uint32_t out0 = b0 | (a0 & 0x80008000u);
|
||||
f16x2 result;
|
||||
result.words[0] = out0;
|
||||
return result;
|
||||
#else
|
||||
f16x2 result;
|
||||
result.data[0] = __float2half(float(v.data[0]));
|
||||
result.data[1] = __float2half(float(v.data[1]));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f8_e4m3b15x4 -> f16x4.
|
||||
/// Uses __byte_perm + lop3 for branch-free vectorized conversion.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f16x4 to<f16x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint32_t in = v.storage.__x;
|
||||
uint32_t a0 = __byte_perm(0u, in, 0x5746u);
|
||||
uint32_t a0_shr = a0 >> 1;
|
||||
uint32_t a0_sign = a0 & 0x80008000u;
|
||||
uint32_t out0;
|
||||
asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign));
|
||||
uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
|
||||
uint32_t a1_shr = a1 >> 1;
|
||||
uint32_t a1_sign = a1 & 0x80008000u;
|
||||
uint32_t out1;
|
||||
asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign));
|
||||
f16x4 result;
|
||||
asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0));
|
||||
asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1));
|
||||
return result;
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
// gfx942: __byte_perm + bitwise E4→E5 shift (no lop3), store via words[].
|
||||
uint32_t in = v.storage.__x;
|
||||
uint32_t a0 = __byte_perm(0u, in, 0x5746u);
|
||||
uint32_t out0 = ((a0 >> 1) & 0x3f803f80u) | (a0 & 0x80008000u);
|
||||
uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
|
||||
uint32_t out1 = ((a1 >> 1) & 0x3f803f80u) | (a1 & 0x80008000u);
|
||||
f16x4 result;
|
||||
result.words[0] = out0;
|
||||
result.words[1] = out1;
|
||||
return result;
|
||||
#else
|
||||
f16x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = __float2half(float(v.data[i]));
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f16x2 -> f8_e4m3b15x2.
|
||||
/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint32_t in0;
|
||||
asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
|
||||
// Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
|
||||
uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
|
||||
uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
|
||||
alo = alo < 0x3B80u ? alo : 0x3B80u;
|
||||
ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
|
||||
uint32_t a0 = alo | (ahi << 16);
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
|
||||
return bit_cast<f8_e4m3b15x2>(packed);
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
// gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, pack.
|
||||
uint32_t in0 = v.words[0];
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu;
|
||||
uint32_t a0;
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
|
||||
return bit_cast<f8_e4m3b15x2>(packed);
|
||||
#else
|
||||
f8_e4m3b15x2 result;
|
||||
result.data[0] = __fp8_e4m3b15(__half2float(v.data[0]));
|
||||
result.data[1] = __fp8_e4m3b15(__half2float(v.data[1]));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f16x4 -> f8_e4m3b15x4.
|
||||
/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint32_t in0, in1;
|
||||
asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0]));
|
||||
asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu;
|
||||
uint32_t abs1 = in1 & 0x7fff7fffu;
|
||||
uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
|
||||
uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
a1 = a1 * 2u + 0x00800080u;
|
||||
uint32_t b0, b1;
|
||||
asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u));
|
||||
asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u));
|
||||
uint32_t packed = __byte_perm(b0, b1, 0x7531u);
|
||||
return bit_cast<f8_e4m3b15x4>(packed);
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
// gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, __byte_perm pack.
|
||||
uint32_t in0 = v.words[0], in1 = v.words[1];
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
|
||||
uint32_t a0, a1;
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
a1 = a1 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
uint32_t b1 = a1 | (in1 & 0x80008000u);
|
||||
uint32_t packed = __byte_perm(b0, b1, 0x7531u);
|
||||
return bit_cast<f8_e4m3b15x4>(packed);
|
||||
#else
|
||||
f8_e4m3b15x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = __fp8_e4m3b15(__half2float(v.data[i]));
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- fp8_e4m3b15 <-> f32 conversion specializations (software, always available) ---
|
||||
|
||||
/// f8_e4m3b15x2 -> f32x2.
|
||||
/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x2 to<f32x2, f8_e4m3b15x2>(const f8_e4m3b15x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
|
||||
float2 f2 = __half22float2(h);
|
||||
return bit_cast<f32x2>(f2);
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
f16x2 h = to<f16x2, f8_e4m3b15x2>(v);
|
||||
f32x2 result;
|
||||
result.data[0] = __half2float(h.data[0]);
|
||||
result.data[1] = __half2float(h.data[1]);
|
||||
return result;
|
||||
#else
|
||||
f32x2 result;
|
||||
result.data[0] = float(v.data[0]);
|
||||
result.data[1] = float(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f8_e4m3b15x4 -> f32x4.
|
||||
/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f32x4 to<f32x4, f8_e4m3b15x4>(const f8_e4m3b15x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
|
||||
__half2 h0, h1;
|
||||
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h0)) : "r"(h.words[0]));
|
||||
asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h1)) : "r"(h.words[1]));
|
||||
float2 f0 = __half22float2(h0);
|
||||
float2 f1 = __half22float2(h1);
|
||||
f32x4 result;
|
||||
result.data[0] = f0.x;
|
||||
result.data[1] = f0.y;
|
||||
result.data[2] = f1.x;
|
||||
result.data[3] = f1.y;
|
||||
return result;
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
f16x4 h = to<f16x4, f8_e4m3b15x4>(v);
|
||||
f32x4 result;
|
||||
result.data[0] = __half2float(h.data[0]);
|
||||
result.data[1] = __half2float(h.data[1]);
|
||||
result.data[2] = __half2float(h.data[2]);
|
||||
result.data[3] = __half2float(h.data[3]);
|
||||
return result;
|
||||
#else
|
||||
f32x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = float(v.data[i]);
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f32x2 -> f8_e4m3b15x2.
|
||||
/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f32x2>(const f32x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
float2 f2 = {v.data[0], v.data[1]};
|
||||
__half2 h = __float22half2_rn(f2);
|
||||
return to<f8_e4m3b15x2, f16x2>(h);
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
f16x2 h;
|
||||
h.data[0] = __float2half_rn(v.data[0]);
|
||||
h.data[1] = __float2half_rn(v.data[1]);
|
||||
return to<f8_e4m3b15x2, f16x2>(h);
|
||||
#else
|
||||
f8_e4m3b15x2 result;
|
||||
result.data[0] = __fp8_e4m3b15(v.data[0]);
|
||||
result.data[1] = __fp8_e4m3b15(v.data[1]);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// f32x4 -> f8_e4m3b15x4.
|
||||
/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f32x4>(const f32x4& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
float2 f01 = {v.data[0], v.data[1]};
|
||||
float2 f23 = {v.data[2], v.data[3]};
|
||||
__half2 h01 = __float22half2_rn(f01);
|
||||
__half2 h23 = __float22half2_rn(f23);
|
||||
f16x4 h;
|
||||
asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast<uint32_t*>(&h01)));
|
||||
asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast<uint32_t*>(&h23)));
|
||||
return to<f8_e4m3b15x4, f16x4>(h);
|
||||
#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
|
||||
f16x4 h;
|
||||
h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
|
||||
h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
|
||||
return to<f8_e4m3b15x4, f16x4>(h);
|
||||
#else
|
||||
f8_e4m3b15x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = __fp8_e4m3b15(v.data[i]);
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
// --- fp8_e4m3b15 arithmetic (software, always available) ---
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
|
||||
return __fp8_e4m3b15(float(a) + float(b));
|
||||
}
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
|
||||
f8_e4m3b15x2 result;
|
||||
result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0]));
|
||||
result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1]));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <bool UseClip = true>
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
|
||||
f8_e4m3b15x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// --- fp8_e4m3b15 min (software) ---
|
||||
|
||||
template <>
|
||||
MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
|
||||
return __fp8_e4m3b15(fminf(float(a), float(b)));
|
||||
}
|
||||
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
|
||||
f8_e4m3b15x2 result;
|
||||
result.data[0] = mscclpp::min(a.data[0], b.data[0]);
|
||||
result.data[1] = mscclpp::min(a.data[1], b.data[1]);
|
||||
return result;
|
||||
}
|
||||
|
||||
MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
|
||||
f8_e4m3b15x4 result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
result.data[i] = mscclpp::min(a.data[i], b.data[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif // MSCCLPP_DEVICE_COMPILE
|
||||
} // namespace mscclpp
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ namespace mscclpp {
|
||||
class Host2DeviceSemaphore {
|
||||
private:
|
||||
Semaphore semaphore_;
|
||||
std::shared_ptr<uint64_t> inboundToken_;
|
||||
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
|
||||
std::unique_ptr<uint64_t> outboundToken_;
|
||||
|
||||
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
|
||||
/// @param connection The connection associated with this semaphore.
|
||||
Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
|
||||
|
||||
/// Destructor.
|
||||
~Host2DeviceSemaphore();
|
||||
|
||||
/// Move constructor.
|
||||
Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
|
||||
|
||||
/// Move assignment operator.
|
||||
Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
|
||||
|
||||
/// Returns the connection.
|
||||
/// @return The connection associated with this semaphore.
|
||||
Connection& connection();
|
||||
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
|
||||
private:
|
||||
Semaphore semaphore_;
|
||||
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
|
||||
detail::UniqueGpuPtr<uint64_t> outboundToken_;
|
||||
|
||||
public:
|
||||
/// Constructor.
|
||||
|
||||
@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
|
||||
|
||||
/// Signal remote device, ensures prior memory ops complete.
|
||||
MSCCLPP_DEVICE_INLINE void signal() {
|
||||
auto outbound = incOutbound();
|
||||
#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
|
||||
// Using memoryOrderSeqCst is faster for A100.
|
||||
atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
|
||||
#else
|
||||
atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
|
||||
#elif defined(MSCCLPP_DEVICE_HIP)
|
||||
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
|
||||
MSCCLPP_DEVICE_INLINE void relaxedSignal() {
|
||||
auto outbound = incOutbound();
|
||||
atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
|
||||
#elif defined(MSCCLPP_DEVICE_HIP)
|
||||
(void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
|
||||
#endif
|
||||
}
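// Editorial note (not part of the original source): both signal paths above now perform a
// fire-and-forget atomic add of 1 on the remote inbound token instead of storing the local
// outbound counter, so the waiter only compares the token against its locally tracked expected
// value and the device-side outbound read/increment helpers removed below are no longer needed.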
|
||||
|
||||
/// Thread-safe read of expected inbound value.
|
||||
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
|
||||
return atomicLoad<uint64_t, scopeSystem>(inboundToken, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Thread-safe read of outbound value.
|
||||
/// @return The outbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
|
||||
return atomicLoad<uint64_t, scopeDevice>(outboundToken, memoryOrderRelaxed);
|
||||
}
|
||||
|
||||
/// Thread-safe increment of outbound value.
|
||||
/// @return The incremented outbound value.
|
||||
MSCCLPP_DEVICE_INLINE uint64_t incOutbound() {
|
||||
return atomicFetchAdd<uint64_t, scopeDevice>(outboundToken, 1, memoryOrderRelaxed) + 1;
|
||||
}
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
|
||||
/// A local memory space where the remote device will write its semaphore value and the local device will read it.
|
||||
uint64_t* inboundToken;
|
||||
|
||||
/// A local memory space where the local device stores the semaphore value to be written to the remote device.
|
||||
uint64_t* outboundToken;
|
||||
|
||||
/// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the
|
||||
/// remote device.
|
||||
/// A remote memory space where the local device atomically increments. This is inboundToken of the remote device.
|
||||
uint64_t* remoteInboundToken;
|
||||
|
||||
/// A local memory space where the local device stores the expected value of the inboundToken to wait for.
|
||||
|
||||
@@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build"
|
||||
name = "mscclpp"
|
||||
dynamic = ["version"]
|
||||
description = "MSCCL++ Python API"
|
||||
requires-python = ">=3.8"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"numpy",
|
||||
"blake3",
|
||||
"pybind11",
|
||||
"sortedcontainers",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
cuda11 = ["cupy-cuda11x"]
|
||||
cuda12 = ["cupy-cuda12x"]
|
||||
cuda13 = ["cupy-cuda13x"]
|
||||
rocm6 = ["cupy"]
|
||||
benchmark = [
|
||||
"mpi4py",
|
||||
"prettytable",
|
||||
"netifaces",
|
||||
"matplotlib",
|
||||
]
|
||||
test = [
|
||||
"pytest",
|
||||
"mpi4py",
|
||||
"netifaces",
|
||||
]
|
||||
|
||||
[tool.setuptools_scm]
|
||||
write_to = "python/mscclpp/_version.py"
|
||||
@@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF"
|
||||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
target-version = ['py38']
|
||||
target-version = ['py310']
|
||||
include = '\.pyi?$'
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2)
|
||||
FetchContent_MakeAvailable(nanobind)
|
||||
@@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
|
||||
set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
|
||||
target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES})
|
||||
target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
|
||||
if(MSCCLPP_USE_ROCM)
|
||||
target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM)
|
||||
endif()
|
||||
install(TARGETS mscclpp_py LIBRARY DESTINATION .)
|
||||
|
||||
@@ -75,15 +75,17 @@ void register_algorithm(nb::module_& m) {
|
||||
[](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
|
||||
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
|
||||
std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
|
||||
std::unordered_map<std::string, uintptr_t> extras) {
|
||||
std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
|
||||
return self.execute(comm, reinterpret_cast<const void*>(input), reinterpret_cast<void*>(output),
|
||||
inputSize, outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
|
||||
nBlocks, nThreadsPerBlock, symmetricMemory, extras);
|
||||
nBlocks, nThreadsPerBlock, symmetricMemory, extras,
|
||||
static_cast<DataType>(accumDtype));
|
||||
},
|
||||
nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
|
||||
nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
|
||||
nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
|
||||
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>())
|
||||
nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
|
||||
nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
|
||||
.def("reset", &Algorithm::reset);
|
||||
|
||||
nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
|
||||
|
||||
@@ -47,7 +47,8 @@ void register_core(nb::module_& m) {
|
||||
.value("bfloat16", DataType::BFLOAT16)
|
||||
.value("float8_e4m3", DataType::FLOAT8_E4M3)
|
||||
.value("float8_e5m2", DataType::FLOAT8_E5M2)
|
||||
.value("uint8", DataType::UINT8);
|
||||
.value("uint8", DataType::UINT8)
|
||||
.value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
|
||||
|
||||
nb::class_<Bootstrap>(m, "CppBootstrap")
|
||||
.def("get_rank", &Bootstrap::getRank)
|
||||
|
||||
@@ -28,6 +28,7 @@ void register_env(nb::module_& m) {
|
||||
.def_ro("force_nccl_fallback_operation", &Env::forceNcclFallbackOperation)
|
||||
.def_ro("nccl_symmetric_memory", &Env::ncclSymmetricMemory)
|
||||
.def_ro("force_disable_nvls", &Env::forceDisableNvls)
|
||||
.def_ro("force_disable_gdr", &Env::forceDisableGdr)
|
||||
.def_ro("ib_gid_index", &Env::ibGidIndex);
|
||||
|
||||
m.def("env", &env);
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/function.h>
|
||||
#include <nanobind/stl/shared_ptr.h>
|
||||
#include <nanobind/stl/string.h>
|
||||
#include <nanobind/stl/unordered_map.h>
|
||||
#include <nanobind/stl/vector.h>
|
||||
|
||||
|
||||
@@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) {
|
||||
return DLDataType{kDLBfloat, 16, 1};
|
||||
} else if (type == "torch.float16") {
|
||||
return DLDataType{kDLFloat, 16, 1};
|
||||
} else if (type == "torch.float8_e4m3fn") {
|
||||
return DLDataType{kDLFloat8_e4m3fn, 8, 1};
|
||||
} else if (type == "torch.float8_e4m3fnuz") {
|
||||
return DLDataType{kDLFloat8_e4m3fnuz, 8, 1};
|
||||
} else if (type == "torch.float8_e5m2") {
|
||||
return DLDataType{kDLFloat8_e5m2, 8, 1};
|
||||
} else if (type == "torch.float8_e5m2fnuz") {
|
||||
return DLDataType{kDLFloat8_e5m2fnuz, 8, 1};
|
||||
} else if (type == "torch.uint8") {
|
||||
return DLDataType{kDLUInt, 8, 1};
|
||||
} else if (type == "fp8_e4m3b15") {
|
||||
// No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes.
|
||||
return DLDataType{kDLUInt, 8, 1};
|
||||
} else {
|
||||
throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage);
|
||||
}
|
||||
|
||||
@@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) {
|
||||
nb::class_<MemoryDevice2DeviceSemaphore::DeviceHandle>(memoryDevice2DeviceSemaphore, "DeviceHandle")
|
||||
.def(nb::init<>())
|
||||
.def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken)
|
||||
.def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken)
|
||||
.def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken)
|
||||
.def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken)
|
||||
.def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes {
|
||||
|
||||
@@ -57,7 +57,7 @@ default_algo_configs = [
|
||||
|
||||
|
||||
def create_default_plans():
|
||||
plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default")
|
||||
plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default")
|
||||
plan_path = Path(plan_dir)
|
||||
if plan_path.exists():
|
||||
shutil.rmtree(plan_path)
|
||||
|
||||
@@ -177,6 +177,7 @@ class Algorithm:
|
||||
nthreads_per_block=0,
|
||||
symmetric_memory: bool = False,
|
||||
extras: Optional[Dict[str, int]] = None,
|
||||
accum_dtype: Optional[CppDataType] = None,
|
||||
) -> int:
|
||||
"""Execute the collective algorithm.
|
||||
|
||||
@@ -194,10 +195,14 @@ class Algorithm:
|
||||
nthreads_per_block: Number of threads per block (0 for auto-selection).
|
||||
symmetric_memory: Whether to use symmetric memory optimization (default: False).
|
||||
extras: Additional algorithm-specific parameters.
|
||||
accum_dtype: Data type for accumulation during reduction. If None, defaults to
|
||||
the same as dtype. Use DataType.float32 for high-precision FP8 accumulation.
|
||||
|
||||
Returns:
|
||||
The result code (0 for success).
|
||||
"""
|
||||
merged_extras = dict(extras) if extras is not None else {}
|
||||
accum_dtype = accum_dtype if accum_dtype is not None else dtype
|
||||
return self._algorithm.execute(
|
||||
comm,
|
||||
int(input_buffer),
|
||||
@@ -211,7 +216,8 @@ class Algorithm:
|
||||
nblocks,
|
||||
nthreads_per_block,
|
||||
symmetric_memory,
|
||||
extras if extras is not None else {},
|
||||
merged_extras,
|
||||
int(accum_dtype),
|
||||
)
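A minimal usage sketch of the new accum_dtype argument, in the same keyword style the tests below use; algo, comm_group, and buf are assumed to be an existing Algorithm, CommGroup, and GpuBuffer:

ret = algo.execute(
    comm=comm_group.communicator,
    input_buffer=buf.data.ptr,
    output_buffer=buf.data.ptr,
    input_size=buf.nbytes,
    output_size=buf.nbytes,
    dtype=DataType.float8_e4m3,
    op=ReduceOp.SUM,
    stream=cp.cuda.get_current_stream().ptr,
    accum_dtype=DataType.float32,  # accumulate partial sums in float32, store back as FP8
)
assert ret == 0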
|
||||
|
||||
def reset(self):
|
||||
|
||||
@@ -192,6 +192,9 @@ class NativeCodeCompiler:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._initialized = False
|
||||
|
||||
def _do_init(self):
|
||||
self._is_hip = cp.cuda.runtime.is_hip
|
||||
self._device_arch = get_device_arch()
|
||||
self._compiler = self._get_compiler()
|
||||
@@ -226,6 +229,7 @@ class NativeCodeCompiler:
|
||||
]
|
||||
self._cache_dir = Path(env().cache_dir) / "native"
|
||||
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._initialized = True
|
||||
|
||||
def _get_compiler(self) -> str:
|
||||
"""Get the path to the appropriate compiler.
|
||||
@@ -246,6 +250,8 @@ class NativeCodeCompiler:
|
||||
Returns:
|
||||
str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD).
|
||||
"""
|
||||
if not self._initialized:
|
||||
self._do_init()
|
||||
return self._device_arch
|
||||
|
||||
def __call__(self, name: str, file: str, **kwds):
|
||||
@@ -290,6 +296,8 @@ class NativeCodeCompiler:
|
||||
>>> # Use the module to create an algorithm
|
||||
>>> algo = module.create_allreduce_algorithm(comm, buffer, size)
|
||||
"""
|
||||
if not self._initialized:
|
||||
self._do_init()
|
||||
if not os.path.isfile(file):
|
||||
raise FileNotFoundError(f"The specified source file does not exist: {file}")
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ class MemoryChannel:
|
||||
|
||||
for tb_id in tb_list:
|
||||
tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type)
|
||||
tb_channel_ids = get_program().setup_channel(tb, self)
|
||||
tb_channel_ids = get_program().setup_channel(tb_id, self)
|
||||
op = GetOperation(
|
||||
src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)],
|
||||
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)],
|
||||
|
||||
@@ -745,7 +745,7 @@ class ReduceOperation(BaseOperation):
|
||||
remote_dst_buff=self.remote_dst_buff + other.dst_buff,
|
||||
channel_ids=self.channel_ids,
|
||||
put_channel_ids=self.put_channel_ids + other.channel_ids,
|
||||
channel_type=self.channel_type,
|
||||
channel_type=other.channel_type,
|
||||
reduce_operation=self.reduce_operation,
|
||||
tbg_info=self.tbg_info,
|
||||
packet=self.packet,
|
||||
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -1,10 +1,10 @@
|
||||
mpi4py==4.1.1
|
||||
cupy==13.6.0
|
||||
mpi4py
|
||||
cupy
|
||||
prettytable
|
||||
netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -1,7 +1,7 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0)
|
||||
FetchContent_MakeAvailable(nanobind)
|
||||
|
||||
397
python/test/test_fp8_accum.py
Normal file

@@ -0,0 +1,397 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# Correctness test for FP8 allreduce with different accumulation types.
|
||||
#
|
||||
# Verifies that FP8 allreduce with higher-precision accumulation produces
|
||||
# results at least as accurate as native FP8 accumulation, by comparing
|
||||
# against a float32 reference.
|
||||
#
|
||||
# Usage:
|
||||
# mpirun -np 8 pytest python/test/test_fp8_accum.py -v
|
||||
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported
|
||||
from mscclpp.ext import AlgorithmCollectionBuilder
|
||||
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
|
||||
# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs.
|
||||
# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed.
|
||||
_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip
|
||||
_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89
|
||||
pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def e4m3fn_to_float(uint8_array):
|
||||
"""Decode a cupy uint8 array of E4M3FN bit patterns to float32."""
|
||||
bits = uint8_array.astype(cp.int32)
|
||||
sign = (bits >> 7) & 1
|
||||
exp = (bits >> 3) & 0xF
|
||||
mant = bits & 0x7
|
||||
|
||||
# Normal: (-1)^s * 2^(exp-7) * (1 + mant/8)
|
||||
normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32))
|
||||
# Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8)
|
||||
subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6))
|
||||
|
||||
result = cp.where(exp == 0, subnormal_val, normal_val)
|
||||
result = cp.where(sign == 1, -result, result)
|
||||
# Zero
|
||||
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
|
||||
# NaN: exp==15 & mant==7
|
||||
nan_mask = (exp == 15) & (mant == 7)
|
||||
result = cp.where(nan_mask, cp.float32(float("nan")), result)
|
||||
return result
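A small worked example of the decode rule above, with values checked by hand against the E4M3FN definition; it assumes the e4m3fn_to_float helper defined here and a GPU with cupy available:

import cupy as cp

# 0x3F = sign 0, exp 0b0111 (=7), mant 0b111 (=7): 2^(7-7) * (1 + 7/8) = 1.875
# 0x08 = sign 0, exp 0b0001 (=1), mant 0b000     : 2^(1-7) * 1.0       = 0.015625
# 0x01 = subnormal, exp 0, mant 1                : 2^(-6) * (1/8)      = 0.001953125
vals = e4m3fn_to_float(cp.array([0x3F, 0x08, 0x01], dtype=cp.uint8))
assert cp.allclose(vals, cp.array([1.875, 0.015625, 0.001953125], dtype=cp.float32))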
|
||||
|
||||
|
||||
def float_to_e4m3fn(f32_array, chunk_size=65536):
|
||||
"""Encode a cupy float32 array to uint8 E4M3FN bit patterns.
|
||||
|
||||
Uses a lookup-table approach: precompute all 128 positive E4M3FN values,
|
||||
then find nearest match per element via chunked broadcast comparison.
|
||||
"""
|
||||
# Build lookup table of all 128 positive E4M3FN values (0x00..0x7F)
|
||||
all_bytes = cp.arange(128, dtype=cp.uint8)
|
||||
all_floats = e4m3fn_to_float(all_bytes) # (128,) float32
|
||||
# Mark NaN entries as inf so they're never selected as nearest
|
||||
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
|
||||
|
||||
# Clamp input and extract sign
|
||||
clamped = f32_array.astype(cp.float32)
|
||||
clamped = cp.clip(clamped, -448.0, 448.0)
|
||||
signs = (clamped < 0).astype(cp.uint8)
|
||||
absval = cp.abs(clamped)
|
||||
|
||||
result = cp.zeros(absval.shape, dtype=cp.uint8)
|
||||
n = absval.size
|
||||
absval_flat = absval.ravel()
|
||||
result_flat = result.ravel()
|
||||
|
||||
for start in range(0, n, chunk_size):
|
||||
end = min(start + chunk_size, n)
|
||||
chunk = absval_flat[start:end]
|
||||
# (chunk_size, 128) difference matrix
|
||||
diffs = cp.abs(chunk[:, None] - all_floats[None, :])
|
||||
result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
|
||||
|
||||
# Combine with sign bit
|
||||
result = result_flat.reshape(absval.shape)
|
||||
result = result | (signs << 7)
|
||||
# Handle exact zero
|
||||
result = cp.where(absval == 0, cp.uint8(0), result)
|
||||
return result
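A short sanity check of the lookup-table encoder above (assumes the helpers defined in this file and a GPU with cupy): exactly representable values round-trip, other values snap to the nearest representable neighbor, and out-of-range values are clamped.

import cupy as cp

x = cp.array([1.875, 240.0, 241.0, 500.0], dtype=cp.float32)
bits = float_to_e4m3fn(x)
back = e4m3fn_to_float(bits)

# 1.875 and 240.0 are exact E4M3FN values; 241.0 snaps down to 240.0 because the
# spacing between representable values near 240 is 16; 500.0 is clamped to 448.
assert cp.allclose(back, cp.array([1.875, 240.0, 240.0, 448.0], dtype=cp.float32))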
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def e4m3b15_to_float(uint8_array):
|
||||
"""Decode a cupy uint8 array of E4M3B15 bit patterns to float32."""
|
||||
bits = uint8_array.astype(cp.int32)
|
||||
sign = (bits >> 7) & 1
|
||||
exp = (bits >> 3) & 0xF
|
||||
mant = bits & 0x7
|
||||
|
||||
# Normal: (-1)^s * 2^(exp-15) * (1 + mant/8)
|
||||
normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32))
|
||||
# Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8)
|
||||
subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14))
|
||||
|
||||
result = cp.where(exp == 0, subnormal_val, normal_val)
|
||||
result = cp.where(sign == 1, -result, result)
|
||||
# Zero
|
||||
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
|
||||
# NaN: exp==15 or negative zero (0x80)
|
||||
nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80)
|
||||
result = cp.where(nan_mask, cp.float32(float("nan")), result)
|
||||
return result
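A quick worked check of the E4M3B15 range noted above (assumes the helper defined here and cupy): the largest finite value is exp=14, mant=7, i.e. 2^(14-15) * (1 + 7/8) = 0.9375, and exp=15 decodes to NaN.

import cupy as cp

# 0x77 = sign 0, exp 14, mant 7 -> 0.5 * 1.875 = 0.9375 (maximum finite value)
# 0x78 = sign 0, exp 15, mant 0 -> NaN in this encoding
vals = e4m3b15_to_float(cp.array([0x77, 0x78], dtype=cp.uint8))
assert float(vals[0]) == 0.9375 and bool(cp.isnan(vals[1]))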
|
||||
|
||||
|
||||
def float_to_e4m3b15(f32_array, chunk_size=65536):
|
||||
"""Encode a cupy float32 array to uint8 E4M3B15 bit patterns.
|
||||
|
||||
Same lookup-table approach as float_to_e4m3fn.
|
||||
"""
|
||||
# Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F)
|
||||
all_bytes = cp.arange(128, dtype=cp.uint8)
|
||||
all_floats = e4m3b15_to_float(all_bytes) # (128,) float32
|
||||
# Mark NaN entries as inf so they're never selected as nearest
|
||||
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
|
||||
|
||||
# Clamp input and extract sign
|
||||
clamped = f32_array.astype(cp.float32)
|
||||
clamped = cp.clip(clamped, -0.9375, 0.9375)
|
||||
signs = (clamped < 0).astype(cp.uint8)
|
||||
absval = cp.abs(clamped)
|
||||
|
||||
result = cp.zeros(absval.shape, dtype=cp.uint8)
|
||||
n = absval.size
|
||||
absval_flat = absval.ravel()
|
||||
result_flat = result.ravel()
|
||||
|
||||
for start in range(0, n, chunk_size):
|
||||
end = min(start + chunk_size, n)
|
||||
chunk = absval_flat[start:end]
|
||||
# (chunk_size, 128) difference matrix
|
||||
diffs = cp.abs(chunk[:, None] - all_floats[None, :])
|
||||
result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
|
||||
|
||||
# Combine with sign bit
|
||||
result = result_flat.reshape(absval.shape)
|
||||
result = result | (signs << 7)
|
||||
# Handle exact zero
|
||||
result = cp.where(absval == 0, cp.uint8(0), result)
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared test helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def setup_algorithms(mpi_group):
|
||||
"""Build default algorithms and return (comm_group, algo_map, scratch_buf)."""
|
||||
comm_group = CommGroup(mpi_group.comm)
|
||||
scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB
|
||||
AlgorithmCollectionBuilder.reset()
|
||||
builder = AlgorithmCollectionBuilder()
|
||||
algorithms = builder.build_default_algorithms(
|
||||
scratch_buffer=scratch.data.ptr,
|
||||
scratch_buffer_size=scratch.nbytes,
|
||||
rank=comm_group.my_rank,
|
||||
)
|
||||
algo_map = {a.name: a for a in algorithms}
|
||||
return comm_group, algo_map, scratch
|
||||
|
||||
|
||||
def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0):
|
||||
"""Run allreduce in-place on buffer and return a copy of the result."""
|
||||
ret = algo.execute(
|
||||
comm=comm_group.communicator,
|
||||
input_buffer=buffer.data.ptr,
|
||||
output_buffer=buffer.data.ptr,
|
||||
input_size=buffer.nbytes,
|
||||
output_size=buffer.nbytes,
|
||||
dtype=dtype,
|
||||
op=ReduceOp.SUM,
|
||||
stream=cp.cuda.get_current_stream().ptr,
|
||||
nblocks=nblocks,
|
||||
nthreads_per_block=nthreads_per_block,
|
||||
symmetric_memory=True,
|
||||
accum_dtype=accum_dtype,
|
||||
)
|
||||
cp.cuda.Device().synchronize()
|
||||
assert ret == 0, f"Allreduce failed with error code {ret}"
|
||||
return buffer.copy()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: FP8 E4M3 accumulation correctness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@parametrize_mpi_groups(8)
|
||||
@pytest.mark.parametrize(
|
||||
"algo_name",
|
||||
[
|
||||
"default_allreduce_packet",
|
||||
"default_allreduce_nvls_packet",
|
||||
"default_allreduce_fullmesh",
|
||||
"default_allreduce_rsag_zero_copy",
|
||||
"default_allreduce_allpair_packet",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576])
|
||||
def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
"""Verify that FP8 E4M3 allreduce with higher-precision accumulation is at
|
||||
least as accurate as native FP8 accumulation, across all algorithm variants."""
|
||||
rank = mpi_group.comm.rank
|
||||
world_size = mpi_group.comm.size
|
||||
|
||||
comm_group, algo_map, scratch = setup_algorithms(mpi_group)
|
||||
if algo_name not in algo_map:
|
||||
pytest.skip(f"{algo_name} not available")
|
||||
if "nvls" in algo_name and not is_nvls_supported():
|
||||
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
|
||||
algo = algo_map[algo_name]
|
||||
|
||||
buf = GpuBuffer(size, dtype=cp.uint8)
|
||||
|
||||
accum_configs = [
|
||||
("fp8_native", DataType.float8_e4m3),
|
||||
("float16", DataType.float16),
|
||||
("float32", DataType.float32),
|
||||
]
|
||||
|
||||
# rsag_zero_copy and fullmesh need explicit block/thread counts
|
||||
if "rsag" in algo_name:
|
||||
nb = max(1, min(32, size // (world_size * 32)))
|
||||
nt = 1024
|
||||
elif "fullmesh" in algo_name:
|
||||
nb = 35
|
||||
nt = 512
|
||||
else:
|
||||
nb = 0
|
||||
nt = 0
|
||||
|
||||
errors = {}
|
||||
for accum_label, accum_dtype in accum_configs:
|
||||
# Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm)
|
||||
rng = np.random.RandomState(42 + rank)
|
||||
src_f32 = cp.asarray(rng.randn(size).astype(np.float32))
|
||||
src_f32 = cp.clip(src_f32, -240.0, 240.0)
|
||||
src_fp8 = float_to_e4m3fn(src_f32)
|
||||
|
||||
# Copy into symmetric buffer
|
||||
buf[:] = src_fp8
|
||||
cp.cuda.Device().synchronize()
|
||||
|
||||
# Run allreduce
|
||||
result = run_allreduce(
|
||||
algo,
|
||||
comm_group,
|
||||
buf,
|
||||
dtype=DataType.float8_e4m3,
|
||||
accum_dtype=accum_dtype,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
)
|
||||
result_f32 = e4m3fn_to_float(result)
|
||||
|
||||
# Compute float32 reference: sum all ranks' quantized FP8 inputs in float32
|
||||
ref_f32 = cp.zeros(size, dtype=cp.float32)
|
||||
for r in range(world_size):
|
||||
rng_r = np.random.RandomState(42 + r)
|
||||
rank_data = cp.asarray(rng_r.randn(size).astype(np.float32))
|
||||
rank_data = cp.clip(rank_data, -240.0, 240.0)
|
||||
rank_data_fp8 = float_to_e4m3fn(rank_data)
|
||||
ref_f32 += e4m3fn_to_float(rank_data_fp8)
|
||||
|
||||
# Compute errors
|
||||
abs_err = cp.abs(result_f32 - ref_f32)
|
||||
mean_abs_err = float(cp.mean(abs_err))
|
||||
errors[accum_label] = mean_abs_err
|
||||
|
||||
# Reset between runs
|
||||
algo.reset()
|
||||
|
||||
# Higher-precision accumulation should be at least as accurate as native fp8
|
||||
assert (
|
||||
errors["float16"] <= errors["fp8_native"] + 1e-6
|
||||
), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})"
|
||||
assert (
|
||||
errors["float32"] <= errors["fp8_native"] + 1e-6
|
||||
), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: FP8 E4M3B15 accumulation correctness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@parametrize_mpi_groups(8)
|
||||
@pytest.mark.parametrize(
|
||||
"algo_name",
|
||||
[
|
||||
"default_allreduce_packet",
|
||||
"default_allreduce_nvls_packet",
|
||||
"default_allreduce_rsag_zero_copy",
|
||||
"default_allreduce_fullmesh",
|
||||
"default_allreduce_allpair_packet",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("size", [1024, 4096, 65536])
|
||||
def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
"""Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at
|
||||
least as accurate as native E4M3B15 accumulation."""
|
||||
rank = mpi_group.comm.rank
|
||||
world_size = mpi_group.comm.size
|
||||
|
||||
comm_group, algo_map, scratch = setup_algorithms(mpi_group)
|
||||
if algo_name not in algo_map:
|
||||
pytest.skip(f"{algo_name} not available")
|
||||
if "nvls" in algo_name and not is_nvls_supported():
|
||||
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform")
|
||||
|
||||
algo = algo_map[algo_name]
|
||||
buf = GpuBuffer(size, dtype=cp.uint8)
|
||||
|
||||
accum_configs = [
|
||||
("e4m3b15_native", DataType.float8_e4m3b15),
|
||||
("float16", DataType.float16),
|
||||
("float32", DataType.float32),
|
||||
]
|
||||
|
||||
# rsag_zero_copy needs explicit block/thread counts, scaled to data size
|
||||
if "rsag" in algo_name:
|
||||
nb = max(1, min(32, size // (world_size * 32)))
|
||||
nt = 1024
|
||||
else:
|
||||
nb = 0
|
||||
nt = 0
|
||||
|
||||
errors = {}
|
||||
for accum_label, accum_dtype in accum_configs:
|
||||
# Generate deterministic per-rank random uint8 values in valid e4m3b15 range
|
||||
rng = np.random.RandomState(42 + rank)
|
||||
raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8))
|
||||
signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7
|
||||
src_uint8 = raw | signs
|
||||
# Fix negative zero -> positive zero
|
||||
src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8)
|
||||
|
||||
# Copy into symmetric buffer
|
||||
buf[:] = src_uint8
|
||||
cp.cuda.Device().synchronize()
|
||||
|
||||
# Run allreduce
|
||||
result = run_allreduce(
|
||||
algo,
|
||||
comm_group,
|
||||
buf,
|
||||
dtype=DataType.float8_e4m3b15,
|
||||
accum_dtype=accum_dtype,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
)
|
||||
|
||||
# Decode result
|
||||
result_f32 = e4m3b15_to_float(result)
|
||||
|
||||
# Compute float32 reference
|
||||
ref_f32 = cp.zeros(size, dtype=cp.float32)
|
||||
for r in range(world_size):
|
||||
rng_r = np.random.RandomState(42 + r)
|
||||
raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8))
|
||||
signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7
|
||||
bits_r = raw_r | signs_r
|
||||
bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r)
|
||||
ref_f32 += e4m3b15_to_float(bits_r)
|
||||
|
||||
# Clamp reference to e4m3b15 representable range
|
||||
ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375)
|
||||
|
||||
# Compute errors (only on valid entries)
|
||||
valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
|
||||
abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
|
||||
mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
|
||||
errors[accum_label] = mean_abs_err
|
||||
|
||||
algo.reset()
|
||||
|
||||
# Higher-precision accumulation should be at least as accurate as native
|
||||
assert (
|
||||
errors["float16"] <= errors["e4m3b15_native"] + 1e-8
|
||||
), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"
|
||||
assert (
|
||||
errors["float32"] <= errors["e4m3b15_native"] + 1e-8
|
||||
), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})"
|
||||
@@ -28,6 +28,16 @@ if(MSCCLPP_USE_IB)
|
||||
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
|
||||
target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
|
||||
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
|
||||
if(MLX5_FOUND)
|
||||
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
|
||||
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(MSCCLPP_USE_GDRCOPY)
|
||||
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
|
||||
target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
|
||||
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
|
||||
endif()
|
||||
|
||||
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
|
||||
|
||||
@@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
|
||||
CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
|
||||
size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
|
||||
cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
|
||||
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras) {
|
||||
bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
|
||||
DataType accumDtype) {
|
||||
if (accumDtype == DataType::AUTO) accumDtype = dtype;
|
||||
if (!initialized_) {
|
||||
initFunc_(comm);
|
||||
initialized_ = true;
|
||||
@@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const vo
|
||||
contexts_[ctxKey] = ctx;
|
||||
}
|
||||
return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
|
||||
nThreadsPerBlock, extras);
|
||||
nThreadsPerBlock, extras, accumDtype);
|
||||
}
|
||||
|
||||
const std::string& NativeAlgorithm::name() const { return name_; }
|
||||
@@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM
|
||||
|
||||
Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; }
|
||||
|
||||
void NativeAlgorithm::reset() {
|
||||
contexts_.clear();
|
||||
initialized_ = false;
|
||||
}
|
||||
void NativeAlgorithm::reset() { contexts_.clear(); }
|
||||
|
||||
void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName,
|
||||
std::shared_ptr<Algorithm> algorithm) {
|
||||
@@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
|
||||
CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
|
||||
size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
|
||||
std::shared_ptr<Executor> executor, int, int, bool,
|
||||
const std::unordered_map<std::string, uintptr_t>&) {
|
||||
const std::unordered_map<std::string, uintptr_t>&, DataType) {
|
||||
if (!executor) {
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
|
||||
}
|
||||
@@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
|
||||
plan_, stream);
|
||||
break;
|
||||
#endif
|
||||
case DataType::FLOAT8_E4M3B15:
|
||||
executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
|
||||
DataType::FLOAT8_E4M3B15, plan_, stream);
|
||||
break;
|
||||
case DataType::INT32:
|
||||
case DataType::UINT32:
|
||||
executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <mscclpp/npkit/npkit.hpp>
|
||||
#endif
|
||||
|
||||
#include <mscclpp/atomic_device.hpp>
|
||||
#include <mscclpp/numa.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <sstream>
|
||||
@@ -197,45 +198,54 @@ void IBConnection::recvThreadFunc() {
|
||||
}
|
||||
}
|
||||
|
||||
// Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
|
||||
uint32_t lastImmData = 0;
|
||||
uint64_t immHighBits = 0;
|
||||
uint64_t newValueHost = 0;
|
||||
|
||||
while (!stopRecvThread_.load(std::memory_order_relaxed)) {
|
||||
auto qp = qp_.lock();
|
||||
if (!qp) break;
|
||||
auto qp = qp_.lock();
|
||||
if (!qp) return;
|
||||
|
||||
while (!stopRecvThread_.load(std::memory_order_relaxed)) {
|
||||
int wcNum = qp->pollRecvCq();
|
||||
if (wcNum < 0) {
|
||||
WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");
|
||||
recvThreadErrorMsg_ = "pollRecvCq failed";
|
||||
recvThreadError_.store(true, std::memory_order_release);
|
||||
WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = 0; i < wcNum; ++i) {
|
||||
int status = qp->getRecvWcStatus(i);
|
||||
if (status != static_cast<int>(WsStatus::Success)) {
|
||||
WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i));
|
||||
// Post another recv to replace the failed one
|
||||
qp->stageRecv(/*wrId=*/0);
|
||||
qp->postRecv();
|
||||
continue;
|
||||
// A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
|
||||
// All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
|
||||
// and set the error flag so the main thread can detect it.
|
||||
recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
|
||||
recvThreadError_.store(true, std::memory_order_release);
|
||||
WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
|
||||
return;
|
||||
}
|
||||
|
||||
// The imm_data contains newValue (32-bit, extended to 64-bit)
|
||||
// Note: getRecvWcImmData already converts from network byte order via ntohl
|
||||
unsigned int immData = qp->getRecvWcImmData(i);
|
||||
newValueHost = static_cast<uint64_t>(immData);
|
||||
// Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
|
||||
// using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
|
||||
// are less than the previous value, the upper 32 bits must have incremented by 1.
|
||||
uint32_t immData = qp->getRecvWcImmData(i);
|
||||
if (immData < lastImmData) {
|
||||
immHighBits += (1ULL << 32);
|
||||
}
|
||||
lastImmData = immData;
|
||||
newValueHost = immHighBits | static_cast<uint64_t>(immData);
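A minimal Python sketch of the 32-bit-to-64-bit token reconstruction described in the comment above, for illustration only; it assumes tokens arrive in order and never jump by 2^32 or more between signals:

class TokenReassembler:
    """Rebuild a monotonically increasing 64-bit token from its lower 32 bits."""

    def __init__(self):
        self.last_low = 0
        self.high_bits = 0

    def push(self, imm_data: int) -> int:
        low = imm_data & 0xFFFFFFFF
        if low < self.last_low:        # lower half wrapped around,
            self.high_bits += 1 << 32  # so the upper half must have incremented
        self.last_low = low
        return self.high_bits | low


r = TokenReassembler()
tokens = [1, 2, 0xFFFFFFFF, 0x1_0000_0001, 0x1_0000_0002]
assert [r.push(t & 0xFFFFFFFF) for t in tokens] == tokens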
|
||||
|
||||
// Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
|
||||
uint64_t dstGpuAddr = remoteUpdateDstAddr_;
|
||||
if (dstGpuAddr != 0) {
|
||||
uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);
|
||||
|
||||
// Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream
|
||||
MSCCLPP_CUDATHROW(
|
||||
cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));
|
||||
|
||||
INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData,
|
||||
")");
|
||||
// Forward the token to the semaphore's inbound token address via atomicStore
|
||||
// through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
|
||||
if (signalAddr_ != 0) {
|
||||
if (signalGdrMap_ && signalGdrMap_->valid()) {
|
||||
atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed);
|
||||
} else {
|
||||
// For HIP/ROCm.
|
||||
// NOTE: may need a fix in the future to ensure BAR1 mapping.
|
||||
*reinterpret_cast<volatile uint64_t*>(signalAddr_) = newValueHost;
|
||||
}
|
||||
}
|
||||
|
||||
// Post another recv for future messages
|
||||
@@ -250,60 +260,105 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
|
||||
: BaseConnection(context, localEndpoint),
|
||||
transport_(localEndpoint.transport()),
|
||||
remoteTransport_(remoteEndpoint.transport()),
|
||||
dummyAtomicSource_(std::make_unique<uint64_t>(0)),
|
||||
atomicSrc_(std::make_unique<uint64_t>(0)),
|
||||
ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
|
||||
gdrSignalForwarding_(false),
|
||||
stopRecvThread_(false),
|
||||
recvThreadError_(false),
|
||||
localGpuDeviceId_(localEndpoint.device().id),
|
||||
signalStream_(nullptr),
|
||||
remoteUpdateDstAddr_(0) {
|
||||
signalAddr_(0) {
|
||||
qp_ = getImpl(localEndpoint).ibQp_;
|
||||
qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
|
||||
qp_.lock()->rts();
|
||||
dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
|
||||
validateTransport(dummyAtomicSourceMem_, transport_);
|
||||
dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
|
||||
atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_);
|
||||
validateTransport(atomicSrcMem_, transport_);
|
||||
atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_);
|
||||
|
||||
if (ibNoAtomic_) {
|
||||
// Create a CUDA stream for async memory copies
|
||||
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
|
||||
#if defined(MSCCLPP_USE_CUDA)
|
||||
// On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1.
|
||||
if (!gdrEnabled()) {
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage,
|
||||
"IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage());
|
||||
}
|
||||
gdrSignalForwarding_ = true;
|
||||
#endif // defined(MSCCLPP_USE_CUDA)
|
||||
|
||||
// Pre-post receive requests for incoming write-with-imm
|
||||
// On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200
|
||||
// NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct
|
||||
// routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available
|
||||
// on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace
|
||||
// aarch64), HostNoAtomic works without Data Direct.
|
||||
//
|
||||
// We cannot reliably detect the bridge at compile time or runtime, so we emit a warning
|
||||
// when the device is not a VF. If data corruption occurs, switching to VF devices with
|
||||
// Data Direct or using IbMode::Host with RDMA atomics will resolve it.
|
||||
{
|
||||
IbCtx* ibCtx = getImpl(*context).getIbContext(transport_);
|
||||
if (!ibCtx->isVirtualFunction()) {
|
||||
WARN(CONN,
|
||||
"IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption "
|
||||
"on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). "
|
||||
"Device ",
|
||||
ibCtx->getDevName(),
|
||||
" is not a VF. "
|
||||
"If you experience data corruption, use VF devices with Data Direct or IbMode::Host.");
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-post receive requests for incoming WRITE_WITH_IMM notifications.
|
||||
// The recv CQE guarantees the preceding data WRITE has been committed to GPU memory.
|
||||
auto qp = qp_.lock();
|
||||
int maxRecvWr = localEndpoint.config().ib.maxRecvWr;
|
||||
for (int i = 0; i < maxRecvWr; ++i) {
|
||||
qp->stageRecv(/*wrId=*/0);
|
||||
}
|
||||
qp->postRecv();
|
||||
// Start the background thread to poll recv CQ
|
||||
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
|
||||
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode");
|
||||
// The recv thread is started later in startSignalForwarding() when the semaphore
|
||||
// provides the signal forwarding destination. This ensures the thread lifetime is
|
||||
// bounded by the GdrMap lifetime (created before start, destroyed after stop).
|
||||
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode");
|
||||
} else {
|
||||
INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode");
|
||||
}
|
||||
}
|
||||
|
||||
IBConnection::~IBConnection() {
|
||||
if (ibNoAtomic_) {
|
||||
stopRecvThread_.store(true, std::memory_order_relaxed);
|
||||
if (recvThread_.joinable()) {
|
||||
recvThread_.join();
|
||||
}
|
||||
if (signalStream_ != nullptr) {
|
||||
// Synchronize stream to ensure all async copies are complete before destruction
|
||||
// Ignore errors during teardown (CUDA context may already be destroyed)
|
||||
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_));
|
||||
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_));
|
||||
}
|
||||
}
|
||||
}
|
||||
IBConnection::~IBConnection() { stopSignalForwarding(); }
|
||||
|
||||
Transport IBConnection::transport() const { return transport_; }
|
||||
|
||||
Transport IBConnection::remoteTransport() const { return remoteTransport_; }
|
||||
|
||||
void IBConnection::setRemoteUpdateDstAddr(uint64_t addr) {
|
||||
remoteUpdateDstAddr_ = addr;
|
||||
INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr);
|
||||
bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; }
|
||||
|
||||
void IBConnection::startSignalForwarding(std::shared_ptr<uint64_t> mem) {
|
||||
// Set up the forwarding destination and GdrMap, then start the recv thread.
|
||||
// Order: set address → create GdrMap → start thread.
|
||||
signalAddr_ = reinterpret_cast<uint64_t>(mem.get());
|
||||
if (gdrSignalForwarding_) {
|
||||
signalGdrMap_ = std::make_unique<GdrMap>(std::move(mem), localGpuDeviceId_);
|
||||
}
|
||||
if (ibNoAtomic_) {
|
||||
stopRecvThread_.store(false, std::memory_order_relaxed);
|
||||
recvThread_ = std::thread([this]() { this->recvThreadFunc(); });
|
||||
}
|
||||
INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_);
|
||||
}
|
||||
|
||||
void IBConnection::stopSignalForwarding() {
|
||||
// Stop the recv thread, then tear down GdrMap and address.
|
||||
// Order: stop thread → destroy GdrMap → clear address.
|
||||
if (ibNoAtomic_) {
|
||||
stopRecvThread_.store(true, std::memory_order_relaxed);
|
||||
if (recvThread_.joinable()) {
|
||||
recvThread_.join();
|
||||
}
|
||||
}
|
||||
if (gdrSignalForwarding_) {
|
||||
signalGdrMap_.reset();
|
||||
}
|
||||
signalAddr_ = 0;
|
||||
INFO(CONN, "IBConnection stopSignalForwarding");
|
||||
}
|
||||
|
||||
void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
|
||||
@@ -356,25 +411,29 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
|
||||
*src = newValue;
|
||||
|
||||
if (ibNoAtomic_) {
|
||||
// Use RDMA write-with-imm instead of atomic operation
|
||||
// Send only newValue in imm_data (0-byte write)
|
||||
// The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write
|
||||
|
||||
// Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
|
||||
// Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the
|
||||
// token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around
|
||||
// detection (tokens are monotonically increasing, so a decrease in the lower 32 bits
|
||||
// indicates the upper 32 bits incremented by 1).
|
||||
if (newValue <= oldValue) {
|
||||
WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue);
|
||||
} else if (newValue - oldValue >= (1ULL << 32)) {
|
||||
WARN(CONN,
|
||||
"IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue,
|
||||
" -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)");
|
||||
}
|
||||
unsigned int immData = static_cast<unsigned int>(newValue);
|
||||
|
||||
// Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything)
|
||||
qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
|
||||
/*size=*/0, /*wrId=*/0,
|
||||
/*srcOffset=*/0, /*dstOffset=*/0,
|
||||
/*signaled=*/true, /*immData=*/immData);
|
||||
qp_.lock()->postSend();
|
||||
INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
|
||||
INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue);
|
||||
} else {
|
||||
qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
|
||||
qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
|
||||
/*signaled=*/true);
|
||||
qp_.lock()->postSend();
|
||||
INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
|
||||
INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
|
||||
" -> ", newValue);
|
||||
}
|
||||
|
||||
@@ -388,6 +447,11 @@ void IBConnection::flush(int64_t timeoutUsec) {
|
||||
NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0);
|
||||
#endif
|
||||
|
||||
// Check if the recv thread has already reported an error (e.g., QP entered error state).
|
||||
if (recvThreadError_.load(std::memory_order_acquire)) {
|
||||
THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
|
||||
}
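A compact sketch of the error-propagation pattern used here: the polling thread records a message and sets a flag before exiting, and the caller of flush() observes the flag and raises. Shown in Python for illustration only; these names do not exist in the library.

import threading

class PollingWorker:
    def __init__(self):
        self.error = threading.Event()   # plays the role of the atomic error flag
        self.error_msg = ""
        self.stop = threading.Event()

    def run(self):
        while not self.stop.is_set():
            ok = True                    # placeholder for pollRecvCq() / WC status checks
            if not ok:
                self.error_msg = "recv work completion failed"
                self.error.set()         # publish the error, then exit the thread
                return

    def flush(self):
        if self.error.is_set():          # main thread surfaces the background failure
            raise RuntimeError(f"recv thread failed: {self.error_msg}")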
|
||||
|
||||
Timer timer;
|
||||
while (qp_.lock()->getNumSendCqItems()) {
|
||||
int wcNum = qp_.lock()->pollSendCq();
|
||||
|
||||
@@ -46,8 +46,6 @@ void CudaIpcStream::sync() {
|
||||
}
|
||||
}
|
||||
|
||||
Context::Impl::Impl() {}
|
||||
|
||||
IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
|
||||
// Find IB context or create it
|
||||
auto it = ibContexts_.find(ibTransport);
|
||||
|
||||
@@ -47,11 +47,16 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
|
||||
if (config_.ib.gidIndex < 0) {
|
||||
config_.ib.gidIndex = env()->ibGidIndex;
|
||||
}
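The -1 sentinel keeps the per-endpoint setting optional: an explicit non-negative value wins, otherwise the MSCCLPP_IB_GID_INDEX environment value (itself defaulting to -1) is used. A short sketch of the precedence, with illustrative names:

def resolve_gid_index(endpoint_gid_index: int, env_gid_index: int) -> int:
    # Explicit per-endpoint value (>= 0) takes priority over the environment default.
    return endpoint_gid_index if endpoint_gid_index >= 0 else env_gid_index

assert resolve_gid_index(3, -1) == 3    # explicit config wins
assert resolve_gid_index(-1, 5) == 5    # falls back to MSCCLPP_IB_GID_INDEX
assert resolve_gid_index(-1, -1) == -1  # both unset: left to the lower layer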
|
||||
|
||||
int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
|
||||
|
||||
ibQp_ = contextImpl.getIbContext(config_.transport)
|
||||
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
|
||||
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
|
||||
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
|
||||
ibQpInfo_ = ibQp_->getInfo();
|
||||
} else if (config_.transport == Transport::Ethernet) {
|
||||
// Configuring Ethernet Interfaces
|
||||
@@ -74,6 +79,7 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
|
||||
if (AllIBTransports.has(config_.transport)) {
|
||||
ibLocal_ = false;
|
||||
it = detail::deserialize(it, ibQpInfo_);
|
||||
it = detail::deserialize(it, ibNoAtomic_);
|
||||
} else if (config_.transport == Transport::Ethernet) {
|
||||
it = detail::deserialize(it, socketAddress_);
|
||||
}
|
||||
@@ -103,6 +109,7 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
|
||||
detail::serialize(data, pimpl_->pidHash_);
|
||||
if (AllIBTransports.has(pimpl_->config_.transport)) {
|
||||
detail::serialize(data, pimpl_->ibQpInfo_);
|
||||
detail::serialize(data, pimpl_->ibNoAtomic_);
|
||||
} else if (pimpl_->config_.transport == Transport::Ethernet) {
|
||||
detail::serialize(data, pimpl_->socketAddress_);
|
||||
}
|
||||
|
||||
@@ -66,6 +66,7 @@ Env::Env()
|
||||
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
|
||||
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
|
||||
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
|
||||
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
|
||||
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", -1)) {}
|
||||
|
||||
std::shared_ptr<Env> env() {
|
||||
@@ -94,6 +95,7 @@ std::shared_ptr<Env> env() {
|
||||
logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation);
|
||||
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
|
||||
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
|
||||
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
|
||||
}
|
||||
return globalEnv;
|
||||
|
||||
@@ -82,6 +82,12 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
|
||||
case DataType::FLOAT8_E5M2:
|
||||
// FP8 is not supported in CUDA execution kernel.
|
||||
break;
|
||||
case DataType::FLOAT8_E4M3B15:
|
||||
// fp8_e4m3b15 is a software type not supported in the CUDA execution kernel.
|
||||
break;
|
||||
case DataType::AUTO:
|
||||
// AUTO is a sentinel resolved before reaching this point; nothing to do.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
204
src/core/gdr.cc
Normal file

@@ -0,0 +1,204 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "gdr.hpp"
|
||||
|
||||
#if defined(MSCCLPP_USE_GDRCOPY)
|
||||
|
||||
#include <gdrapi.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
#ifndef GPU_PAGE_SHIFT
|
||||
#define GPU_PAGE_SHIFT 16
|
||||
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
||||
#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
|
||||
#endif
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
// GdrContext
|
||||
|
||||
class GdrContext {
|
||||
public:
|
||||
GdrContext();
|
||||
~GdrContext();
|
||||
|
||||
GdrContext(const GdrContext&) = delete;
|
||||
GdrContext& operator=(const GdrContext&) = delete;
|
||||
|
||||
GdrStatus status() const { return status_; }
|
||||
gdr_t handle() const { return handle_; }
|
||||
|
||||
private:
|
||||
GdrStatus status_;
|
||||
gdr_t handle_;
|
||||
};
|
||||
|
||||
static std::shared_ptr<GdrContext> gdrContext() {
|
||||
static auto instance = std::make_shared<GdrContext>();
|
||||
return instance;
|
||||
}
|
||||
|
||||
GdrStatus gdrStatus() { return gdrContext()->status(); }
|
||||
|
||||
bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
|
||||
|
||||
const char* gdrStatusMessage() {
|
||||
switch (gdrStatus()) {
|
||||
case GdrStatus::Ok:
|
||||
return "GDRCopy initialized successfully";
|
||||
case GdrStatus::NotBuilt:
|
||||
return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
|
||||
case GdrStatus::Disabled:
|
||||
return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
|
||||
case GdrStatus::DriverMissing:
|
||||
return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
|
||||
case GdrStatus::OpenFailed:
|
||||
return "gdr_open() failed; GDRCopy driver may be misconfigured";
|
||||
default:
|
||||
return "unknown GDRCopy status";
|
||||
}
|
||||
}
|
||||
|
||||
GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
|
||||
if (env()->forceDisableGdr) {
|
||||
INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
|
||||
status_ = GdrStatus::Disabled;
|
||||
return;
|
||||
}
|
||||
|
||||
// Auto-detect: check if driver is available
|
||||
if (access("/dev/gdrdrv", F_OK) != 0) {
|
||||
INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
|
||||
status_ = GdrStatus::DriverMissing;
|
||||
return;
|
||||
}
|
||||
|
||||
handle_ = gdr_open();
|
||||
if (handle_ == nullptr) {
|
||||
INFO(GPU, "gdr_open() failed, disabling GDRCopy");
|
||||
status_ = GdrStatus::OpenFailed;
|
||||
return;
|
||||
}
|
||||
|
||||
status_ = GdrStatus::Ok;
|
||||
INFO(GPU, "GDRCopy initialized successfully");
|
||||
}
|
||||
|
||||
GdrContext::~GdrContext() {
|
||||
if (handle_ != nullptr) {
|
||||
gdr_close(handle_);
|
||||
handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// GdrMap::Impl — real implementation with GDRCopy
|
||||
|
||||
struct GdrMap::Impl {
|
||||
std::shared_ptr<GdrContext> ctx;
|
||||
std::shared_ptr<void> gpuMem;
|
||||
gdr_mh_t mh;
|
||||
void* barPtr;
|
||||
uint64_t* hostDstPtr;
|
||||
size_t mappedSize;
|
||||
};
|
||||
|
||||
GdrMap::GdrMap(std::shared_ptr<void> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
|
||||
pimpl_->ctx = gdrContext();
|
||||
pimpl_->gpuMem = std::move(gpuMem);
|
||||
pimpl_->mh = {};
|
||||
pimpl_->barPtr = nullptr;
|
||||
pimpl_->hostDstPtr = nullptr;
|
||||
pimpl_->mappedSize = 0;
|
||||
|
||||
// Ensure CUDA device context is active for gdr_pin_buffer
|
||||
CudaDeviceGuard deviceGuard(deviceId);
|
||||
|
||||
uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
|
||||
// Align to GPU page boundary and pin one page around the target address
|
||||
unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
|
||||
unsigned long pageOffset = gpuAddr - alignedAddr;
|
||||
pimpl_->mappedSize = GPU_PAGE_SIZE;
|
||||
|
||||
// Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
|
||||
// ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
|
||||
// not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
|
||||
// system-scope acquire.
|
||||
int ret =
|
||||
gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh);
|
||||
if (ret != 0) {
|
||||
ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh);
|
||||
if (ret != 0) {
|
||||
THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr,
|
||||
". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap).");
|
||||
}
|
||||
}
|
||||
|
||||
ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize);
|
||||
if (ret != 0) {
|
||||
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
|
||||
THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr);
|
||||
}
|
||||
|
||||
pimpl_->hostDstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(pimpl_->barPtr) + pageOffset);
|
||||
|
||||
INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr);
|
||||
}
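The pin/map step above works on whole 64 KiB GPU pages, so the target address is split into an aligned page base plus an in-page offset. A small worked sketch of that arithmetic (the address is illustrative):

GPU_PAGE_SHIFT = 16
GPU_PAGE_SIZE = 1 << GPU_PAGE_SHIFT          # 65536
GPU_PAGE_MASK = ~(GPU_PAGE_SIZE - 1)

gpu_addr = 0x7F0012345678                    # hypothetical device pointer
aligned = gpu_addr & GPU_PAGE_MASK           # page base handed to gdr_pin_buffer_v2
offset = gpu_addr - aligned                  # added back on top of the BAR1 mapping

assert aligned == 0x7F0012340000 and offset == 0x5678
# the usable host pointer is bar_ptr + offset, which aliases the original GPU address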
|
||||
|
||||
GdrMap::~GdrMap() {
|
||||
if (pimpl_) {
|
||||
if (pimpl_->barPtr != nullptr) {
|
||||
(void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize);
|
||||
}
|
||||
if (pimpl_->hostDstPtr != nullptr) {
|
||||
(void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; }
|
||||
|
||||
uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; }
|
||||
|
||||
void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); }
|
||||
|
||||
void GdrMap::copyFrom(void* dst, size_t size) const {
|
||||
gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size);
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#else // !defined(MSCCLPP_USE_GDRCOPY)
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
|
||||
|
||||
bool gdrEnabled() { return false; }
|
||||
|
||||
const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
|
||||
|
||||
// GdrMap::Impl — stub (no GDRCopy)
|
||||
|
||||
struct GdrMap::Impl {};
|
||||
|
||||
GdrMap::GdrMap(std::shared_ptr<void> /*gpuMem*/, int /*deviceId*/) {}
|
||||
|
||||
GdrMap::~GdrMap() = default;
|
||||
|
||||
bool GdrMap::valid() const { return false; }
|
||||
|
||||
uint64_t* GdrMap::hostPtr() const { return nullptr; }
|
||||
|
||||
void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {}
|
||||
|
||||
void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // !defined(MSCCLPP_USE_GDRCOPY)
|
||||
@@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) {
|
||||
UnixSocketServer::instance().unregisterFd(handle->posixFd.fd);
|
||||
::close(handle->posixFd.fd);
|
||||
}
|
||||
if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) {
|
||||
if (handle->fabric.allocHandle != 0) {
|
||||
cuMemRelease(handle->fabric.allocHandle);
|
||||
}
|
||||
}
|
||||
delete handle;
|
||||
}
|
||||
}
|
||||
@@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
|
||||
auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter);
|
||||
handle->typeFlags = GpuIpcMemHandle::Type::None;
|
||||
handle->posixFd.fd = -1;
|
||||
handle->fabric.allocHandle = {};
|
||||
|
||||
CUdeviceptr basePtr;
|
||||
size_t sz;
|
||||
@@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) {
|
||||
// FABRIC handle
|
||||
if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) ==
|
||||
CUDA_SUCCESS) {
|
||||
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr));
|
||||
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
|
||||
}
|
||||
|
||||
@@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
|
||||
handle->offsetFromBase = 0;
|
||||
handle->typeFlags = GpuIpcMemHandle::Type::None;
|
||||
handle->posixFd.fd = -1;
|
||||
handle->fabric.allocHandle = {};
|
||||
|
||||
// POSIX FD handle
|
||||
int fileDesc;
|
||||
@@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
|
||||
if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle,
|
||||
CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) {
|
||||
handle->typeFlags |= GpuIpcMemHandle::Type::Fabric;
|
||||
handle->fabric.allocHandle = allocHandle;
|
||||
}
|
||||
|
||||
if (handle->typeFlags == GpuIpcMemHandle::Type::None) {
|
||||
@@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
|
||||
THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created");
|
||||
}
|
||||
|
||||
// Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the
|
||||
// multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle.
|
||||
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
|
||||
// Only release allocHandle if it is not stored in fabric.allocHandle.
|
||||
if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) {
|
||||
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
|
||||
}
|
||||
return handle;
|
||||
#else // !(CUDA_NVLS_API_AVAILABLE)
|
||||
THROW(GPU, Error, ErrorCode::InvalidUsage,
|
||||
@@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
|
||||
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
|
||||
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
|
||||
CUDA_SUCCESS) {
|
||||
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
|
||||
handle_.fabric.allocHandle = {};
|
||||
type_ = GpuIpcMemHandle::Type::Fabric;
|
||||
}
|
||||
}
|
||||
|
||||
120
src/core/ib.cc

@@ -21,6 +21,9 @@
|
||||
#include "context.hpp"
|
||||
#if defined(USE_IBVERBS)
|
||||
#include "ibverbs_wrapper.hpp"
|
||||
#if defined(MSCCLPP_USE_MLX5DV)
|
||||
#include "mlx5dv_wrapper.hpp"
|
||||
#endif // defined(MSCCLPP_USE_MLX5DV)
|
||||
#endif // defined(USE_IBVERBS)
|
||||
#include "logger.hpp"
|
||||
|
||||
@@ -64,7 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) {
|
||||
IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) {
|
||||
if (size == 0) {
|
||||
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0");
|
||||
}
|
||||
@@ -80,13 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
|
||||
bool isGpuBuff = (gpuId != -1);
|
||||
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
|
||||
#if !defined(MSCCLPP_USE_ROCM)
|
||||
int fd;
|
||||
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
int fd = -1;
size_t rangeSize = pages * pageSize;

// Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU
// bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag
// routes DMA through the Data Direct engine for correct ordering and higher throughput.
// Fall back to the default (non-PCIe) mapping if the flag is unsupported.
#if (CUDA_VERSION >= 12030)
CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
if (cuRes != CUDA_SUCCESS || fd < 0) {
if (fd >= 0) ::close(fd);
fd = -1;
}
bool usedPcieFlag = (fd >= 0);
#endif // CUDA_VERSION >= 12030
if (fd < 0) {
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
}

// Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API
// which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs.
size_t offsetInDmaBuf = buffIntPtr % pageSize;
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC;

#if defined(MSCCLPP_USE_MLX5DV)
if (isDataDirect) {
mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif
if (mr_ == nullptr) {
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}

// If MR registration failed with a PCIe-mapped fd, retry with the default mapping.
#if (CUDA_VERSION >= 12030)
if (mr_ == nullptr && usedPcieFlag) {
::close(fd);
MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags);
}
#endif // CUDA_VERSION >= 12030

::close(fd);
if (mr_ == nullptr) {
THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")");
@@ -131,7 +171,7 @@ const void* IbMr::getBuff() const { return buff_; }
uint32_t IbMr::getLkey() const { return mr_->lkey; }

IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum,
int maxSendWr, int maxRecvWr, int maxWrPerSend)
int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic)
: portNum_(portNum),
gidIndex_(gidIndex),
info_(),
@@ -151,7 +191,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
maxSendCqPollNum_(maxSendCqPollNum),
maxSendWr_(maxSendWr),
maxWrPerSend_(maxWrPerSend),
maxRecvWr_(maxRecvWr) {
maxRecvWr_(maxRecvWr),
noAtomic_(noAtomic) {
sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0);
if (sendCq_ == nullptr) {
THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")");
@@ -211,7 +252,8 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendC
qpAttr.qp_state = IBV_QPS_INIT;
qpAttr.pkey_index = 0;
qpAttr.port_num = portNum_;
qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC;
qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE
: (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")");
}
@@ -240,7 +282,7 @@ void IbQp::rtr(const IbQpInfo& info) {
qp_attr.path_mtu = static_cast<ibv_mtu>(info.mtu);
qp_attr.dest_qp_num = info.qpn;
qp_attr.rq_psn = 0;
qp_attr.max_dest_rd_atomic = 1;
qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1;
qp_attr.min_rnr_timer = 0x12;
if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) {
qp_attr.ah_attr.is_global = 1;
@@ -272,7 +314,7 @@ void IbQp::rts() {
qp_attr.retry_cnt = 7;
qp_attr.rnr_retry = 7;
qp_attr.sq_psn = 0;
qp_attr.max_rd_atomic = 1;
qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1;
int ret = IBVerbs::ibv_modify_qp(
qp_, &qp_attr,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
@@ -434,12 +476,38 @@ std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_

unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); }

IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) {
IbCtx::IbCtx(const std::string& devName)
: devName_(devName),
ctx_(nullptr),
pd_(nullptr),
supportsRdmaAtomics_(false),
isMlx5_(false),
isDataDirect_(false),
isVF_(false) {
int num;
struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
if (std::string(devices[i]->name) == devName_) {
ctx_ = IBVerbs::ibv_open_device(devices[i]);

// Detect if this IB device is a Virtual Function (VF).
// VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not.
{
std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn";
isVF_ = (access(physfnPath.c_str(), F_OK) == 0);
if (isVF_) {
INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)");
}
}

#if defined(MSCCLPP_USE_MLX5DV)
if (MLX5DV::isAvailable()) {
isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]);
if (isMlx5_) {
INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs");
}
}
#endif // defined(MSCCLPP_USE_MLX5DV)
break;
}
}
@@ -452,6 +520,20 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_
THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")");
}

// Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path
#if defined(MSCCLPP_USE_MLX5DV)
if (isMlx5_ && MLX5DV::isAvailable()) {
char sysfsPath[256];
int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath));
if (ret == 0) {
isDataDirect_ = true;
INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")");
} else {
INFO(NET, "IB device ", devName_, " does not support Data Direct");
}
}
#endif // defined(MSCCLPP_USE_MLX5DV)

// Query and cache RDMA atomics capability
struct ibv_device_attr attr = {};
if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) {
@@ -512,7 +594,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const {
}

std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend) {
int maxRecvWr, int maxWrPerSend, bool noAtomic) {
if (port == -1) {
port = this->getAnyUsablePort(gidIndex);
if (port == -1) {
@@ -521,16 +603,22 @@ std::shared_ptr<IbQp> IbCtx::createQp(int port, int gidIndex, int maxSendCqSize,
} else if (!this->isPortUsable(port, gidIndex)) {
THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port);
}
return std::shared_ptr<IbQp>(
new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend));
return std::shared_ptr<IbQp>(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr,
maxRecvWr, maxWrPerSend, noAtomic));
}

std::unique_ptr<const IbMr> IbCtx::registerMr(void* buff, std::size_t size) {
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size));
return std::unique_ptr<const IbMr>(new IbMr(pd_, buff, size, isDataDirect_));
}

bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; }

bool IbCtx::isMlx5() const { return isMlx5_; }

bool IbCtx::isDataDirect() const { return isDataDirect_; }

bool IbCtx::isVirtualFunction() const { return isVF_; }

MSCCLPP_API_CPP int getIBDeviceCount() {
int num;
IBVerbs::ibv_get_device_list(&num);

@@ -5,6 +5,7 @@
#define MSCCLPP_CONNECTION_HPP_

#include <atomic>
#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_utils.hpp>
#include <mutex>
@@ -15,6 +16,7 @@
#include "communicator.hpp"
#include "context.hpp"
#include "endpoint.hpp"
#include "gdr.hpp"
#include "ib.hpp"
#include "registered_memory.hpp"
#include "socket.h"
@@ -35,11 +37,18 @@ class BaseConnection {

virtual void flush(int64_t timeoutUsec = -1) = 0;

/// Set the local address where remote updateAndSync operations should write.
/// This is called by the receiver to specify where incoming signals should be written.
/// Default implementation is a no-op for connections that don't need it.
/// @param addr The local address for incoming writes.
virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {}
/// Start signal forwarding to the given memory address.
/// Called by the semaphore to specify where incoming signals should be written.
/// @param mem Shared pointer to the GPU memory for the signal token.
virtual void startSignalForwarding(std::shared_ptr<uint64_t> /*mem*/) {}

/// Stop signal forwarding and release associated resources.
virtual void stopSignalForwarding() {}

/// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode).
/// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to.
/// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
virtual bool isSignalForwarding() const { return false; }

virtual Transport transport() const = 0;

@@ -91,22 +100,29 @@ class IBConnection : public BaseConnection {
Transport transport_;
Transport remoteTransport_;
std::weak_ptr<IbQp> qp_;
std::unique_ptr<uint64_t> dummyAtomicSource_; // not used anywhere but IB needs a source
RegisteredMemory dummyAtomicSourceMem_;
mscclpp::TransportInfo dstTransportInfo_;
std::unique_ptr<uint64_t> atomicSrc_;
RegisteredMemory atomicSrcMem_;
mscclpp::TransportInfo atomicSrcTransportInfo_;

// For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal
// instead of atomic operations, with a host thread forwarding to GPU for memory consistency.
bool ibNoAtomic_;
bool gdrSignalForwarding_;  // ibNoAtomic_ && gdrEnabled() — decided once at construction
std::thread recvThread_;
std::atomic<bool> stopRecvThread_;
int localGpuDeviceId_;  // Local GPU device ID for setting CUDA context in recv thread
cudaStream_t signalStream_;
std::atomic<bool> recvThreadError_;  // Set by recv thread on fatal error
std::string recvThreadErrorMsg_;  // Error message from recv thread (written before recvThreadError_ is set)
int localGpuDeviceId_;  // Local GPU device ID for CUDA context and GDR mapping

// Write-with-imm design:
// - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit)
// - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write
uint64_t remoteUpdateDstAddr_;
// Signal forwarding design (HostNoAtomic mode):
// - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data.
// - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads
//   the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around
//   detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half
//   incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1.
uint64_t signalAddr_;

std::unique_ptr<GdrMap> signalGdrMap_;

void recvThreadFunc();
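The wrap-around reconstruction described in the comment above is the subtle part of this design, so here is a minimal host-side sketch of the idea. It assumes only what the comment states (tokens increase monotonically and the wire carries their lower 32 bits in imm_data); the helper name is illustrative and not part of the diff.

#include <cstdint>

// Illustrative only: rebuild a monotonically increasing 64-bit token from the
// 32-bit value carried in imm_data, given the last token observed locally.
static uint64_t reconstructToken(uint64_t lastToken, uint32_t imm32) {
  uint64_t upper = lastToken & 0xFFFFFFFF00000000ull;
  // If the lower 32 bits went backwards, the 32-bit counter wrapped, so the
  // implicit upper half must have been incremented exactly once.
  if (imm32 < static_cast<uint32_t>(lastToken)) {
    upper += 0x100000000ull;
  }
  return upper | imm32;
}

In the recv thread this reconstructed value would then be stored to the forwarding destination (the GDRCopy-mapped token in the design above).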

@@ -114,10 +130,15 @@ class IBConnection : public BaseConnection {
IBConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint);
~IBConnection();

/// Set the local address where remote updateAndSync operations will write.
/// Must be called before the remote sends any updateAndSync in host-no-atomic mode.
/// @param addr The local address for incoming writes.
void setRemoteUpdateDstAddr(uint64_t addr) override;
/// Start signal forwarding to the given memory address.
/// Must be called before the remote sends any updateAndSync in HostNoAtomic mode.
/// @param mem Shared pointer to the GPU memory for the signal token.
void startSignalForwarding(std::shared_ptr<uint64_t> mem) override;

/// Stop signal forwarding and release associated resources.
void stopSignalForwarding() override;

bool isSignalForwarding() const override;

Transport transport() const override;

@@ -42,8 +42,6 @@ struct Context::Impl {
std::shared_ptr<TokenPool> tokenPool_;
const size_t maxNumTokens_ = 1 << 15; // 32K tokens

Impl();

IbCtx* getIbContext(Transport ibTransport);
std::shared_ptr<uint64_t> getToken();
};

@@ -210,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input
sizeof(int4);
void* remoteMemory = static_cast<char*>(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]);
val = mscclpp::read<int4>(remoteMemory, srcOffset + idx);
tmp = cal_vector<T, OpType>(tmp, val);
tmp = calVector<T, OpType>(tmp, val);
}
output4[outputOffset4 + idx] = tmp;
if constexpr (SendToRemote) {
@@ -353,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in
for (uint32_t index = 0; index < nSrcs; ++index) {
PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]);
PacketPayload<PacketType> val = pkt[idx].read(flag_);
data = cal_vector<T, OpType>(data, val);
data = calVector<T, OpType>(data, val);
}
data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
data = calVector<T, OpType>(data, srcPacketPayload[idx]);
dstPacketPayload[idx] = data;

if constexpr (SendToRemote) {
@@ -394,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void
for (uint32_t index = 0; index < nSrcs; ++index) {
PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]);
PacketPayload<PacketType> val = pkt[idx].read(flag_);
data = cal_vector<T, OpType>(data, val);
data = calVector<T, OpType>(data, val);
}
data = cal_vector<T, OpType>(data, srcPacketPayload[idx]);
data = calVector<T, OpType>(data, srcPacketPayload[idx]);
dstPacketPayload[idx] = data;
PacketType* dst_val = &dstPkt[idx];
dst_val->write(data, flag_);
@@ -464,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo
size_t buffOffset =
(inputOffsets[index] + getOffset<ReuseScratch>(outputBufferRefs[index].type, offset)) / sizeof(int4);
int4 val = buff4[buffOffset + idx];
tmp = cal_vector<T, OpType>(tmp, val);
tmp = calVector<T, OpType>(tmp, val);
}
dst4[dstOffset4 + idx] = tmp;
if constexpr (SendToRemote) {
@@ -899,6 +899,17 @@ class ExecutionKernel {
#endif
break;
#endif // __FP8_TYPES_EXIST__
case DataType::FLOAT8_E4M3B15:
executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize,
plan, semaphores, localMemoryIdBegin, flag
#if defined(ENABLE_NPKIT)
,
NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
#else
);
#endif
break;
case DataType::UINT8:
executionKernel<uint8_t, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores,
@@ -910,6 +921,10 @@ class ExecutionKernel {
);
#endif
break;
case DataType::AUTO:
// AUTO is a sentinel that must be resolved before reaching this point.
assert(false && "DataType::AUTO must be resolved before kernel launch");
break;
}
}
#else  // !defined(MSCCLPP_DEVICE_HIP)

src/core/include/gdr.hpp (new file, 62 lines)
@@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#ifndef MSCCLPP_GDR_HPP_
#define MSCCLPP_GDR_HPP_

#include <cstddef>
#include <cstdint>
#include <memory>

namespace mscclpp {

enum class GdrStatus {
Ok, // GDRCopy initialized successfully
NotBuilt, // Built without MSCCLPP_USE_GDRCOPY
Disabled, // Disabled via MSCCLPP_FORCE_DISABLE_GDR
DriverMissing, // /dev/gdrdrv not found
OpenFailed, // gdr_open() failed
};

/// Return the detailed status of the global GDRCopy context.
GdrStatus gdrStatus();

/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok).
bool gdrEnabled();

/// Return a human-readable error message for the current GDRCopy status.
const char* gdrStatusMessage();

/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
class GdrMap {
public:
/// Pin and map a GPU address for direct host-side access.
/// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared).
/// @param deviceId The CUDA device ID for setting context.
GdrMap(std::shared_ptr<void> gpuMem, int deviceId);
~GdrMap();

GdrMap(const GdrMap&) = delete;
GdrMap& operator=(const GdrMap&) = delete;

/// Whether the mapping was established successfully.
bool valid() const;

/// Return the BAR1-mapped host pointer to the GPU location.
uint64_t* hostPtr() const;

/// Copy data from host memory to the mapped GPU location.
void copyTo(const void* src, size_t size);

/// Copy data from the mapped GPU location to host memory.
void copyFrom(void* dst, size_t size) const;

private:
struct Impl;
std::unique_ptr<Impl> pimpl_;
};

} // namespace mscclpp

#endif // MSCCLPP_GDR_HPP_
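A minimal usage sketch of the interface declared above, assuming GDRCopy is available at runtime; the function name and control flow are illustrative rather than taken from the diff.

// Sketch: forward a 64-bit token from the host into GPU memory via a BAR1 mapping.
#include <memory>
#include "gdr.hpp"

void forwardTokenExample(std::shared_ptr<uint64_t> gpuToken, int deviceId, uint64_t token) {
  using namespace mscclpp;
  if (!gdrEnabled()) return;  // e.g. NotBuilt, Disabled, DriverMissing, or OpenFailed
  GdrMap map(std::static_pointer_cast<void>(gpuToken), deviceId);
  if (!map.valid()) return;
  map.copyTo(&token, sizeof(token));  // host write lands directly in GPU memory
}

This is the pattern the IB connection's recv thread relies on in HostNoAtomic mode: a host-side store that becomes visible to the polling GPU kernel without an extra CUDA memcpy.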
@@ -44,6 +44,7 @@ struct GpuIpcMemHandle {

struct {
char handle[64];
CUmemGenericAllocationHandle allocHandle;
} fabric;

static void deleter(GpuIpcMemHandle* handle);

@@ -36,7 +36,7 @@ class IbMr {
uint32_t getLkey() const;

private:
IbMr(ibv_pd* pd, void* buff, std::size_t size);
IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect);

ibv_mr* mr_;
void* buff_;
@@ -101,7 +101,7 @@ class IbQp {
};

IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend);
int maxRecvWr, int maxWrPerSend, bool noAtomic);
SendWrInfo getNewSendWrInfo();
RecvWrInfo getNewRecvWrInfo();

@@ -128,6 +128,7 @@ class IbQp {
const int maxSendWr_;
const int maxWrPerSend_;
const int maxRecvWr_;
const bool noAtomic_;

friend class IbCtx;
};
@@ -139,18 +140,24 @@ class IbCtx {
~IbCtx();

std::shared_ptr<IbQp> createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr,
int maxRecvWr, int maxWrPerSend);
int maxRecvWr, int maxWrPerSend, bool noAtomic);
std::unique_ptr<const IbMr> registerMr(void* buff, std::size_t size);
bool supportsRdmaAtomics() const;
bool isMlx5() const;
bool isDataDirect() const;
bool isVirtualFunction() const;
#else
IbCtx([[maybe_unused]] const std::string& devName) {}
~IbCtx() {}

std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int) { return nullptr; }
std::shared_ptr<IbQp> createQp(int, int, int, int, int, int, int, bool) { return nullptr; }
std::unique_ptr<const IbMr> registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) {
return nullptr;
}
bool supportsRdmaAtomics() const { return false; }
bool isMlx5() const { return false; }
bool isDataDirect() const { return false; }
bool isVirtualFunction() const { return false; }
#endif

const std::string& getDevName() const { return devName_; };
@@ -163,6 +170,9 @@ class IbCtx {
ibv_context* ctx_;
ibv_pd* pd_;
bool supportsRdmaAtomics_;
bool isMlx5_;
bool isDataDirect_;
bool isVF_;
};

} // namespace mscclpp

src/core/include/mlx5dv_wrapper.hpp (new file, 38 lines)
@@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_
#define MSCCLPP_MLX5DV_WRAPPER_HPP_

#if defined(MSCCLPP_USE_MLX5DV)

#include <infiniband/verbs.h>

#include <string>

namespace mscclpp {

struct MLX5DV {
/// Whether libmlx5.so was successfully loaded at runtime.
static bool isAvailable();

/// Check if the given IB device supports mlx5 Direct Verbs.
static bool mlx5dv_is_supported(struct ibv_device* device);

/// Register a DMABUF memory region using mlx5dv extensions.
/// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version.
static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access);

/// Query the Data Direct sysfs path for the given IB context.
/// Returns 0 on success (device supports Data Direct), non-zero otherwise.
static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len);

private:
static void* dlsym(const std::string& symbol, bool allowReturnNull = false);
};

} // namespace mscclpp

#endif // defined(MSCCLPP_USE_MLX5DV)
#endif // MSCCLPP_MLX5DV_WRAPPER_HPP_
@@ -14,7 +14,7 @@ namespace mscclpp {

// Generic element-wise calculation helper
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) {
if constexpr (OpType == SUM) {
return a + b;
} else if constexpr (OpType == MIN) {
@@ -24,56 +24,168 @@ MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) {
}

// Generic vector reduction helpers
template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) {
int4 ret;
ret.w = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
ret.x = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
ret.y = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
ret.z = bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
return ret;
}

template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) {
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) {
uint2 ret;
ret.x = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
ret.y = bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
ret.x = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.x), bit_cast<T, uint32_t>(b.x)));
ret.y = bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a.y), bit_cast<T, uint32_t>(b.y)));
return ret;
}

template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) {
return bit_cast<int, T>(cal_elements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available).
template <>
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, SUM>(const uint2& a, const uint2& b) {
f32x2 fa = bit_cast<f32x2, uint2>(a);
f32x2 fb = bit_cast<f32x2, uint2>(b);
f32x2 fr = fa + fb;
return bit_cast<uint2, f32x2>(fr);
}

template <>
MSCCLPP_DEVICE_INLINE uint2 calVectorHelper<f32x2, MIN>(const uint2& a, const uint2& b) {
f32x2 fa = bit_cast<f32x2, uint2>(a);
f32x2 fb = bit_cast<f32x2, uint2>(b);
f32x2 fr = mscclpp::min(fa, fb);
return bit_cast<uint2, f32x2>(fr);
}

template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) {
return bit_cast<uint32_t, T>(cal_elements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) {
int4 ret;
ret.w = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
ret.x = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
ret.y = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
ret.z = bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
return ret;
}

// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper
/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic.
template <>
MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, SUM>(const int4& a, const int4& b) {
uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
uint2 lo_r = calVectorHelper<f32x2, SUM>(lo_a, lo_b);
uint2 hi_r = calVectorHelper<f32x2, SUM>(hi_a, hi_b);
return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
}

template <>
MSCCLPP_DEVICE_INLINE int4 calVectorHelper<f32x2, MIN>(const int4& a, const int4& b) {
uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y};
uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w};
uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y};
uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w};
uint2 lo_r = calVectorHelper<f32x2, MIN>(lo_a, lo_b);
uint2 hi_r = calVectorHelper<f32x2, MIN>(hi_a, hi_b);
return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y};
}

template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) {
return bit_cast<int, T>(calElements<T, OpType>(bit_cast<T, int>(a), bit_cast<T, int>(b)));
}

template <typename T, ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) {
return bit_cast<uint32_t, T>(calElements<T, OpType>(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
}

/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback).
template <>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, SUM>(const uint32_t& a, const uint32_t& b) {
float fa = bit_cast<float, uint32_t>(a);
float fb = bit_cast<float, uint32_t>(b);
return bit_cast<uint32_t, float>(fa + fb);
}

template <>
MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper<f32x2, MIN>(const uint32_t& a, const uint32_t& b) {
float fa = bit_cast<float, uint32_t>(a);
float fb = bit_cast<float, uint32_t>(b);
return bit_cast<uint32_t, float>(fminf(fa, fb));
}
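For readers unfamiliar with the packed-float idiom used by the specializations above: an f32x2 value is simply two floats carried in one 64-bit register, and its operator+ can lower to a single packed add on GPUs that support it. The semantics are just element-wise, as in this standalone sketch; MyF32x2 is a hypothetical stand-in, not the project's real f32x2 type.

// Sketch of the semantics only; the real f32x2 and its operator+ live elsewhere in the codebase.
struct MyF32x2 {  // hypothetical two-float pack
  float x, y;
};

__device__ inline MyF32x2 operator+(const MyF32x2& a, const MyF32x2& b) {
  return {a.x + b.x, a.y + b.y};  // element-wise; one packed instruction where the hardware has it
}

__device__ inline MyF32x2 minPacked(const MyF32x2& a, const MyF32x2& b) {
  return {fminf(a.x, b.x), fminf(a.y, b.y)};
}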

// calVector wrapper – converts scalar types to vector types and calls calVectorHelper
template <typename T, ReduceOp OpType, typename DataType>
MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) {
MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) {
// Define the vectorized computation type based on the element type
static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size");
static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes");
using CompType = typename std::conditional_t<
std::is_same_v<T, __half>, f16x2,
std::is_same_v<T, float>, f32x2,
std::conditional_t<
std::is_same_v<T, __bfloat16>, bf16x2,
std::conditional_t<std::is_same_v<T, uint8_t>, u8x4,
std::is_same_v<T, __half>, f16x2,
std::conditional_t<
std::is_same_v<T, __bfloat16>, bf16x2,
std::conditional_t<
std::is_same_v<T, uint8_t>, u8x4,
std::conditional_t<std::is_same_v<T, __fp8_e4m3b15>, f8_e4m3b15x4,
#if defined(__FP8_TYPES_EXIST__)
std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4,
#endif
T
#if defined(__FP8_TYPES_EXIST__)
>>>>>;
std::conditional_t<std::is_same_v<T, __fp8_e4m3>, f8_e4m3x4,
std::conditional_t<std::is_same_v<T, __fp8_e5m2>, f8_e5m2x4, T>>
#else
>>>;
T
#endif
return cal_vector_helper<CompType, OpType>(a, b);
>>>>>;
return calVectorHelper<CompType, OpType>(a, b);
}

/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements).
/// Uses the optimized to<>() specializations when available (e.g. FP8 -> float hardware intrinsics).
/// When AccumT == T, this is a no-op identity.
template <typename T, typename AccumT, typename AccDataType, typename DataType>
MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return val;
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;
ToVec result = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
return reinterpret_cast<const AccDataType&>(result);
}
}

/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements).
/// Uses the optimized to<>() specializations when available.
/// When AccumT == T, this is a no-op identity.
template <typename T, typename AccumT, typename DataType, typename AccDataType>
MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return val;
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;
FromVec result = mscclpp::to<FromVec>(reinterpret_cast<const ToVec&>(val));
return reinterpret_cast<const DataType&>(result);
}
}

/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType).
/// When AccumT == T, falls back to the standard calVector.
/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator.
template <typename T, typename AccumT, ReduceOp OpType, typename AccDataType, typename DataType>
MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) {
if constexpr (std::is_same_v<T, AccumT>) {
return calVector<T, OpType>(acc, val);
} else {
constexpr int nElems = sizeof(DataType) / sizeof(T);
using FromVec = VectorType<T, nElems>;
using ToVec = VectorType<AccumT, nElems>;

ToVec fv = mscclpp::to<ToVec>(reinterpret_cast<const FromVec&>(val));
const ToVec& fa = reinterpret_cast<const ToVec&>(acc);
ToVec fr;
#pragma unroll
for (int i = 0; i < nElems; ++i) {
fr.data[i] = calElements<AccumT, OpType>(fa.data[i], fv.data[i]);
}
return reinterpret_cast<const AccDataType&>(fr);
}
}

#endif // defined(MSCCLPP_DEVICE_COMPILE)
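The three helpers above compose into a simple widen-reduce-narrow pattern. As a sketch, assuming the same namespace and the VectorType/ReduceOp names declared above (the wrapper itself is illustrative and not part of the diff), reducing two packed __half pairs while accumulating in float looks like this:

// Sketch: reduce two packed __half values (one uint32_t each) with float accumulation.
template <ReduceOp OpType>
MSCCLPP_DEVICE_INLINE uint32_t reduceHalfInFloat(uint32_t a, uint32_t b) {
  using AccRaw = VectorType<float, 2>;                          // 2 halves widen to 2 floats
  AccRaw acc = upcastVector<__half, float, AccRaw>(a);          // widen a
  acc = calVectorAccum<__half, float, OpType, AccRaw>(acc, b);  // widen b, reduce element-wise in float
  return downcastVector<__half, float, uint32_t>(acc);          // narrow back to packed halves
}

This mirrors how the allreduce kernel below chooses AccRaw as VectorType<AccumT, sizeof(uint32_t) / sizeof(T)> when AccumT differs from T.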

src/core/mlx5dv_wrapper.cc (new file, 126 lines)
@@ -0,0 +1,126 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#if defined(MSCCLPP_USE_MLX5DV)

// _GNU_SOURCE is required for dlvsym()
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "mlx5dv_wrapper.hpp"

#include <dlfcn.h>
#include <infiniband/mlx5dv.h>

#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT
#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0)
#endif

#include <memory>

#include "logger.hpp"

namespace mscclpp {

static std::unique_ptr<void, int (*)(void*)> globalMLX5Handle(nullptr, &::dlclose);

void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) {
if (!globalMLX5Handle) {
const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
for (int i = 0; possibleLibNames[i] != nullptr; i++) {
void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
if (handle) {
globalMLX5Handle.reset(handle);
break;
}
}
if (!globalMLX5Handle) {
if (allowReturnNull) return nullptr;
THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror()));
}
}
void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str());
if (!ptr && !allowReturnNull) {
THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol);
}
return ptr;
}

bool MLX5DV::isAvailable() {
static int available = -1;
if (available == -1) {
// Try to load the library; if it fails, mlx5dv is not available
const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr};
for (int i = 0; possibleLibNames[i] != nullptr; i++) {
void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW);
if (handle) {
if (!globalMLX5Handle) {
globalMLX5Handle.reset(handle);
} else {
::dlclose(handle);
}
available = 1;
INFO(NET, "libmlx5 loaded successfully");
return true;
}
}
available = 0;
DEBUG(NET, "libmlx5 not available");
}
return available == 1;
}

bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) {
using FuncType = bool (*)(struct ibv_device*);
static FuncType impl = nullptr;
if (!impl) {
void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true);
if (!ptr) return false;
impl = reinterpret_cast<FuncType>(ptr);
}
return impl(device);
}

struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd,
int access) {
// mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags.
// Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol.
using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int);
static FuncType impl = nullptr;
static bool resolved = false;
if (!resolved) {
if (globalMLX5Handle) {
void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25");
if (!ptr) {
ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true);
}
impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
}
resolved = true;
}
if (!impl) return nullptr;
return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
}

int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) {
using FuncType = int (*)(struct ibv_context*, char*, size_t);
static FuncType impl = nullptr;
static bool resolved = false;
if (!resolved) {
if (globalMLX5Handle) {
void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25");
if (!ptr) {
ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true);
}
impl = ptr ? reinterpret_cast<FuncType>(ptr) : nullptr;
}
resolved = true;
}
if (!impl) return -1;
return impl(context, buf, buf_len);
}

} // namespace mscclpp

#endif // defined(MSCCLPP_USE_MLX5DV)
@@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() {
else
return 25000;
#else
cudaDeviceProp dev_prop;
int clockRate;
MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id));
MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id));
return dev_prop.clockRate;
MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id));
return clockRate;
#endif
}
#endif

@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
}
}
} else if (transports.has(Transport::CudaIpc)) {
// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
// try CudaIpc first and fall back to IB on failure.
auto entry = getTransportInfo(Transport::CudaIpc);
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
bool hasIB = (transports & AllIBTransports).any();
try {
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
} catch (const BaseError& e) {
if (!hasIB) {
throw;
}
bool isSameHost = (getHostHash() == this->hostHash);
if (isSameHost) {
WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
} else {
INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
}
}
}
if (this->data != nullptr) {
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);

@@ -8,6 +8,7 @@
#include "atomic.hpp"
#include "connection.hpp"
#include "context.hpp"
#include "logger.hpp"
#include "registered_memory.hpp"
#include "serialization.hpp"

@@ -48,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
token_ = std::make_shared<uint64_t>(0);
} else if (localDevice.type == DeviceType::GPU) {
if (localDevice.id < 0) {
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided");
}
CudaDeviceGuard deviceGuard(localDevice.id);
token_ = gpuCallocToken(connection_.context());
} else {
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type");
}
idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport()));
}
@@ -78,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector<char>
RegisteredMemory idMemory(std::make_shared<RegisteredMemory::Impl>(data.begin(), memEnd));
auto it = detail::deserialize(memEnd, device);
if (it != data.end()) {
throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed");
}
return SemaphoreStub(std::make_shared<Impl>(std::move(idMemory), device));
}
@@ -119,15 +120,35 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema
expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
outboundToken_(std::make_unique<uint64_t>()) {
if (connection().localDevice().type != DeviceType::GPU) {
throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
}
BaseConnection::getImpl(connection())
->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->isSignalForwarding()) {
// Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
// WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
CudaDeviceGuard deviceGuard(connection().localDevice().id);
#if defined(MSCCLPP_USE_ROCM)
inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
#else
inboundToken_ = detail::gpuCallocShared<uint64_t>();
#endif
connImpl->startSignalForwarding(inboundToken_);
}
// When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
// and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
}

MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
: Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}

MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
if (inboundToken_) {
// Clear the connection's signal forwarding destination (and GdrMap)
// before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
BaseConnection::getImpl(connection())->stopSignalForwarding();
}
}

MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }

MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -136,7 +157,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {

MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
Host2DeviceSemaphore::DeviceHandle device;
device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
// If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
// Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
// which is the same address targeted by the NIC's atomic operation.
device.inboundToken =
inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
device.expectedInboundToken = expectedInboundToken_.get();
return device;
}
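For context on how the handle above is consumed: the GPU side waits by spinning on inboundToken until it reaches the locally expected value. The diff does not include that device code, so the following is only a sketch, assuming each signal() advances the token by one and that an acquire-ordered device load (like the atomicLoad used on the host path below) is available.

// Illustrative device-side wait; the project's real wait lives in the device headers.
__device__ inline void waitSemaphoreSketch(uint64_t* inboundToken, uint64_t* expectedInboundToken) {
  uint64_t expected = *expectedInboundToken + 1;  // assumes one increment per signal()
  *expectedInboundToken = expected;
  while (atomicLoad(inboundToken, memoryOrderAcquire) < expected) {
    // spin: in signal-forwarding mode the host recv thread updates *inboundToken via GDRCopy,
    // in atomic mode the NIC's RDMA atomic updates it directly.
  }
}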
@@ -146,13 +171,19 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
expectedInboundToken_(std::make_unique<uint64_t>()),
outboundToken_(std::make_unique<uint64_t>()) {
if (connection().transport() == Transport::CudaIpc) {
throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
}
if (connection().localDevice().type != DeviceType::CPU) {
throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
}
auto connImpl = BaseConnection::getImpl(connection());
if (connImpl->isSignalForwarding()) {
// Signal forwarding mode: tell the recv thread where to write the incoming token.
// Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
auto token =
std::shared_ptr<uint64_t>(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), [](uint64_t*) {});
connImpl->startSignalForwarding(std::move(token));
}
BaseConnection::getImpl(connection())
->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(semaphore_.localMemory().data()));
}

MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection)
@@ -177,17 +208,16 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) {
while (atomicLoad(reinterpret_cast<uint64_t*>(semaphore_.localMemory().data()), memoryOrderAcquire) <
(*expectedInboundToken_)) {
if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) {
throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout);
THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out");
}
}
}

MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore)
: semaphore_(semaphore),
expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()),
outboundToken_(detail::gpuCallocUnique<uint64_t>()) {
: semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique<uint64_t>()) {
if (connection().localDevice().type != DeviceType::GPU) {
throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage);
THROW(CONN, Error, ErrorCode::InvalidUsage,
"Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU");
}
}

@@ -202,7 +232,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe
device.remoteInboundToken = reinterpret_cast<uint64_t*>(semaphore_.remoteMemory().data());
device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
device.expectedInboundToken = expectedInboundToken_.get();
device.outboundToken = outboundToken_.get();
return device;
};

@@ -183,7 +183,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh::build() {
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) -> CommResult {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] DataType accumDtype) -> CommResult {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

@@ -212,7 +212,8 @@ std::shared_ptr<Algorithm> AllgatherFullmesh2::build() {
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>& extras) -> mscclpp::CommResult {
const std::unordered_map<std::string, uintptr_t>& extras,
[[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult {
return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras);
},
[self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,

@@ -2,6 +2,7 @@
// Licensed under the MIT license.

#include <collective_utils.hpp>
#include <type_traits>

#include "allreduce/allreduce_allpair_packet.hpp"
#include "allreduce/common.hpp"
@@ -11,7 +12,7 @@
namespace mscclpp {
namespace collective {

template <ReduceOp OpType, typename T>
template <ReduceOp OpType, typename T, typename AccumT = T>
__global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle<MemoryChannel>* memoryChannels,
size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode,
int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags,
@@ -43,13 +44,16 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand
// step 2: Reduce Data
for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nelems; idx += blockDim.x * gridDim.x) {
uint32_t data = src[idx];
using AccRaw = std::conditional_t<std::is_same_v<T, AccumT>, uint32_t,
mscclpp::VectorType<AccumT, sizeof(uint32_t) / sizeof(T)>>;
AccRaw acc = mscclpp::upcastVector<T, AccumT, AccRaw>(data);
for (int index = 0; index < nPeers; index++) {
const int remoteRank = index < rank ? index : index + 1;
LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems;
uint32_t val = dstPkt[idx].read(flag, -1);
data = cal_vector<T, OpType>(val, data);
acc = mscclpp::calVectorAccum<T, AccumT, OpType, AccRaw>(acc, val);
}
dst[idx] = data;
dst[idx] = mscclpp::downcastVector<T, AccumT, uint32_t>(acc);
}
__syncthreads();
if (threadIdx.x == 0) {
@@ -67,7 +71,7 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
return {(worldSize - 1) * 4, 512};
}

template <ReduceOp OpType, typename T>
template <ReduceOp OpType, typename T, typename AccumT = T>
struct AllpairAdapter {
static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*,
DeviceHandle<SwitchChannel>*, DeviceHandle<SwitchChannel>*, size_t channelInOffset, size_t,
@@ -76,7 +80,12 @@ struct AllpairAdapter {
int nThreadsPerBlock = 0) {
using ChannelType = DeviceHandle<MemoryChannel>;
const size_t nelems = inputSize / sizeof(T);
allreduceAllPairs<OpType, T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
// Round nBlocks to multiple of nPeers so every block maps to a valid peer.
const int nPeers = worldSize - 1;
if (nPeers > 0) {
nBlocks = (nBlocks / nPeers) * nPeers;
}
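As a worked example of the rounding above: with worldSize = 8 there are 7 peers and the default launch of (8 - 1) * 4 = 28 blocks is already a multiple of 7, so it is unchanged; a caller-supplied 30 blocks would be rounded down to 28 so that the blocks divide evenly across peers.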
allreduceAllPairs<OpType, T, AccumT><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
(T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank,
nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize);
return cudaGetLastError();
@@ -94,18 +103,24 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr<Communicator> comm) {
CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr<void> ctx, const void* input, void* output,
size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op,
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
const std::unordered_map<std::string, uintptr_t>&) {
const std::unordered_map<std::string, uintptr_t>&,
DataType accumDtype) {
auto algoCtx = std::static_pointer_cast<AlgorithmCtx>(ctx);
std::pair<int, int> blockAndThreadNum{nBlocks, nThreadsPerBlock};
if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) {
blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize);
}
// nBlocks must be at least nPeers for allpair — each block maps to one peer.
const int nPeers = algoCtx->nRanksPerNode - 1;
if (nPeers > 0 && blockAndThreadNum.first < nPeers) {
return CommResult::CommInvalidArgument;
}
size_t sendBytes;
CUdeviceptr sendBasePtr;
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
size_t channelInOffset = (char*)input - (char*)sendBasePtr;

AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype);
AllreduceFunc allreduce = dispatch<AllpairAdapter>(op, dtype, accumDtype);
if (!allreduce) {
WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast<int>(dtype));
return CommResult::CommInvalidArgument;
@@ -161,9 +176,9 @@ std::shared_ptr<Algorithm> AllreduceAllpairPacket::build() {
[self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras) {
int nThreadsPerBlock, const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock,
extras);
extras, accumDtype);
},
[self](std::shared_ptr<Communicator> comm, const void* input, void* output, size_t inputSize,
[[maybe_unused]] size_t outputSize,
Some files were not shown because too many files have changed in this diff.