diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml new file mode 100644 index 00000000..c4abeaa7 --- /dev/null +++ b/.azure-pipelines/codecov.yml @@ -0,0 +1,93 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: CodeCoverageA100 + timeoutInMinutes: 40 + pool: + name: msccl-ci + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: CodeCoverageH100 + timeoutInMinutes: 40 + pool: + name: msccl-ci-h100 + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' + +- job: CodeCoverageMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + variables: + - group: mscclpp + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml deleted file mode 100644 index a4ffcfc3..00000000 --- a/.azure-pipelines/integration-test-rocm.yml +++ /dev/null @@ -1,114 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: IntegrationTestRocm - displayName: Integration test ROCm - strategy: - matrix: - rocm6.2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - pool: - name: mscclpp-rocm - container: - image: $[ variables['containerImage'] ] - options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallRcclTest - displayName: Install rccl-test - inputs: - targetType: 'inline' - script: | - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallDep - displayName: Install dependencies - inputs: - targetType: 'inline' - script: | - set -e - git clone https://github.com/Azure/msccl-tools.git - cd msccl-tools - pip3 install . 
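Note on the new coverage pipeline above: the three jobs differ only in pool, subscription, platform, and GPU arch; all of the actual logic lives in `templates/codecov.yml`. As a rough local approximation of what that template builds (the coverage flag and cmake options are taken from the template further below; the clone step and paths are illustrative, not part of this pipeline):

```bash
# Sketch: approximate the CI coverage build locally (illustrative paths).
git clone https://github.com/microsoft/mscclpp.git
cd mscclpp
mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug \
      -DMSCCLPP_BYPASS_GPU_CHECK=ON \
      -DMSCCLPP_USE_CUDA=ON \
      -DMSCCLPP_ENABLE_COVERAGE=ON ..   # the flag codecov.yml passes via cmakeArgs
make -j
./bin/unit_tests                        # running tests drops .gcda files next to the objects
lcov --directory . --capture --output-file coverage.info
```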
- - - task: Bash@3 - name: GenerateExectionFiles - displayName: Generate execution files - inputs: - targetType: 'inline' - script: | - set -e - git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users - cd msccl-users - mkdir execution-files - python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json - python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json - - - task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp allReduce test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: AllReduceWithExecutionFileTest - displayName: Run mscclpp allReduce with execution file - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ - -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \ - -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \ - -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index f6fe3a47..d5d5f9bd 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -41,11 +41,10 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: IntegrationTestH100 @@ -61,10 +60,9 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem perfBaselineFile: test/deploy/perf_ndmv5.jsonl gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 97a95c94..d4924879 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -37,33 +37,6 @@ jobs: image: $[ variables['containerImage'] ] steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - - - task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - task: Bash@3 displayName: Add HostEntry inputs: @@ -77,107 +50,46 @@ jobs: echo "Entry already exists, nothing to do." fi - - task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT + - template: templates/deploy.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT - - task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - template: templates/run-remote-task.yml + parameters: + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test - - task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut - - task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests - - task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - - task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID - - - task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT + - template: templates/stop.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yml similarity index 88% rename from .azure-pipelines/nccl-api-test.yaml rename to .azure-pipelines/nccl-api-test.yml index 4951c5bd..cc017412 100644 --- a/.azure-pipelines/nccl-api-test.yaml +++ b/.azure-pipelines/nccl-api-test.yml @@ -40,11 +40,10 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_80,code=sm_80" - job: NcclTestH100 @@ -61,9 +60,8 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_90,code=sm_90" \ No newline at end of file diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml new file mode 100644 index 00000000..43841079 --- /dev/null +++ b/.azure-pipelines/rccl-api-test.yml @@ -0,0 +1,47 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: RcclTestMI300X + displayName: Run MSCCLPP over RCCL Test (MI300X) + pool: + name: msccl-ci-mi300x + + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/rccl-test.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + gpuArch: gfx942 diff --git a/.azure-pipelines/templates/codecov.yml b/.azure-pipelines/templates/codecov.yml new file mode 100644 index 00000000..08797351 --- /dev/null +++ b/.azure-pipelines/templates/codecov.yml @@ -0,0 +1,110 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + buildType: Debug + cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON' + buildDisplayName: 'Build with coverage' + buildName: BuildCoverage + deployArgs: 'single-node-test true ${{ parameters.platform }}' + +- template: run-remote-task.yml + parameters: + name: TestsCoverageNonPerf + displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + STRIP_COUNT=$(echo $BUILD_PREFIX | tr -cd / | wc -c) + export GCOV_PREFIX=/root/mscclpp + export 
GCOV_PREFIX_STRIP=$STRIP_COUNT + + echo "Running unit_tests..." + ./build/bin/unit_tests + echo "unit_tests: PASSED" + + echo "Running mp_unit_tests -np 2..." + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 2: PASSED" + + echo "Running mp_unit_tests -np 4..." + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 4: PASSED" + +- template: run-remote-task.yml + parameters: + name: CaptureCoverage + displayName: Capture coverage data with lcov + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + + GCOV_TOOL_ARG="" + if [ "${{ parameters.platform }}" = "rocm" ]; then + apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1 + GCOV_WRAPPER=$(mktemp) + printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER" + chmod +x "$GCOV_WRAPPER" + GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}" + fi + + lcov --version + LCOV_CAPTURE_ARGS="" + if lcov --help 2>&1 | grep -q "inconsistent"; then + LCOV_CAPTURE_ARGS="--ignore-errors inconsistent" + fi + + lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS} + if [ ! -s coverage.info ]; then + echo "ERROR: coverage.info was not generated." + exit 1 + fi + + lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info + lcov --list coverage.info + ls -la coverage.info + +- task: Bash@3 + name: FetchCoverage + displayName: Fetch coverage data from remote VM + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + HOST=$(head -1 ${HOSTFILE}) + ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \ + 'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info' + scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: UploadCodecov + displayName: Upload coverage to Codecov + inputs: + targetType: 'inline' + script: | + set -e + curl -Os https://cli.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }} + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml new file mode 100644 index 00000000..2f642f1d --- /dev/null +++ b/.azure-pipelines/templates/deploy.yml @@ -0,0 +1,151 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp +# Build parameters +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + default: '' +- name: buildType + type: string + default: 'Release' +- name: buildTests + type: string + default: 'true' +- name: cmakeArgs + type: string + default: '' +- name: buildName + type: string + default: 'Build' +- name: buildDisplayName + type: string + default: 'Build' +# Deploy parameters +- name: deployArgs + type: string + default: '' + +steps: +# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks. 
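For reference, the `GCOV_PREFIX`/`GCOV_PREFIX_STRIP` pair exported by the coverage step above redirects the `.gcda` files that instrumented binaries write at exit: objects compiled under the CI agent's checkout embed absolute paths that do not exist inside the test container, so the runtime prefix is swapped. A minimal sketch, assuming an illustrative build prefix:

```bash
# Objects built under this prefix embed absolute .gcda paths such as
# /home/agent/work/mscclpp/src/foo.gcda (the prefix value is illustrative).
BUILD_PREFIX=/home/agent/work/mscclpp
# Strip one leading path component per '/' in the prefix (here 4).
STRIP_COUNT=$(echo "$BUILD_PREFIX" | tr -cd / | wc -c)
export GCOV_PREFIX=/root/mscclpp        # where the container checkout lives
export GCOV_PREFIX_STRIP=$STRIP_COUNT
# The gcov runtime now writes /root/mscclpp/src/foo.gcda instead, which is
# exactly where the lcov capture step that follows expects to find it.
```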
+- task: Bash@3 + name: EnsureAzureCLI + displayName: Ensure Azure CLI Installed + inputs: + targetType: inline + script: | + set -e + if command -v az >/dev/null 2>&1; then + az version >/dev/null + exit 0 + fi + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# 1. Build +- task: Bash@3 + name: ${{ parameters.buildName }} + displayName: ${{ parameters.buildDisplayName }} + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + BUILD_TESTS_ARG="" + if [ "${{ parameters.buildTests }}" = "true" ]; then + BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON" + fi + + GPU_ARCH_ARG="" + if [ -n "${{ parameters.gpuArch }}" ]; then + GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ "${{ parameters.platform }}" = "rocm" ]; then + eval CXX=/opt/rocm/bin/hipcc cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_ROCM=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + else + eval cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + fi + make -j + cd .. + pwd > build/BUILD_PREFIX + echo "=== Build artifacts ===" + ls -la build/bin/ || echo "ERROR: build/bin/ missing after build" + du -sh build/bin/* 2>/dev/null || true + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 2. Write CMake args for pip install on remote VMs +- task: Bash@3 + name: WritePipCmakeArgs + displayName: Write pip CMake args + inputs: + targetType: 'inline' + script: | + set -e + PIP_CMAKE_ARGS="" + if [ -n "${{ parameters.gpuArch }}" ]; then + PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ -n "${CMAKE_EXTRA_ARGS}" ]; then + PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}" + fi + echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt + echo "pip CMake args: $(cat pip_cmake_args.txt)" + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 3. Download SSH key + install packages + start VMSS +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp.pem + +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} + +# 4. 
Deploy test environment +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: ${{ parameters.deployArgs }} + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml deleted file mode 100644 index b9dac24b..00000000 --- a/.azure-pipelines/templates/integration-test.yaml +++ /dev/null @@ -1,242 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: inline - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - set -e; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: SendRecvTest - displayName: Run mscclpp SendRecv test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp AllReduce test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllToAll - displayName: Run mscclpp AllToAll test - inputs: - targetType: 'inline' - script: | - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: CheckPerfNumber - displayName: Check collective primitives performance - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PythonAllReduceBenchmark - displayName: Python Allreduce Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 -m pip install .; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: FifoPerfBenchmark - displayName: FIFO Performance Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - ./build/bin/perf/fifo_test"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml new file mode 100644 index 00000000..b686e4f2 --- /dev/null +++ b/.azure-pipelines/templates/integration-test.yml @@ -0,0 +1,76 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: perfBaselineFile + type: string + default: 'test/deploy/perf_ndmv4.jsonl' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: AllGatherTest + displayName: Run mscclpp AllGather test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x 
MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: SendRecvTest + displayName: Run mscclpp SendRecv test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllReduceTest + displayName: Run mscclpp AllReduce test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllToAll + displayName: Run mscclpp AllToAll test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: CheckPerfNumber + displayName: Check collective primitives performance + remoteScript: | + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} + +- template: run-remote-task.yml + parameters: + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + remoteScript: | + python3 -m pip install . + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} \ No newline at end of file diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml deleted file mode 100644 index bc804a94..00000000 --- a/.azure-pipelines/templates/nccl-test.yaml +++ /dev/null @@ -1,280 +0,0 @@ -# .azure-pipelines/templates/nccl-test.yaml -# ---------------------------------------- -# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. 
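The sweep flags repeated throughout the integration-test template above follow the nccl-tests convention (assumed here, since mscclpp-test mirrors that CLI): `-b`/`-e` bound the message size, `-f 2` doubles the size each step, `-k` selects a kernel variant, and `-o` appends one JSON line per size so `CheckPerfNumber` can diff the run against a baseline. One representative pair, chained the way the template chains them:

```bash
# One sweep (1 KiB to 1 GiB, doubling) followed by the baseline comparison.
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN \
  ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
python3 test/mscclpp-test/check_perf_result.py \
  --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
```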
-# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# sshKeySecureFile – the secureFile name for your SSH key - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: nvccGencode - type: string - default: "-gencode=arch=compute_80,code=sm_80" - -steps: -- checkout: self -- checkout: git://One/msccl-users -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: mscclpp/test/deploy/deploy.sh - arguments: nccltest-single-node - workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp - -- task: Bash@3 - name: CopyMscclUsers - displayName: Copy msccl-users - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - DST_DIR="/tmp/mscclpp/msccl-users" - parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: GenerateExecutionFile -# displayName: Generate execution file -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp/msccl-users; \ -# mkdir -p execution-files; \ -# cd /root/mscclpp/msccl-users; \ -# bash algos/mscclpp_a100/generate_execution_plan.sh"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNcclTests - displayName: Install NCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl-tests.git; \ - cd nccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllReduceTest -# displayName: Run NCCL 
AllReduce Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllGatherTest -# displayName: Run NCCL AllGather Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterTest -# displayName: Run NCCL Reduce Scatter Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNccl - displayName: Install NCCL - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl.git; \ - cd nccl; \ - make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllGatherFallbaclkToNcclTest - displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - 
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllReduceFallbaclkToNcclTest - displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x 
NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclBroadcastFallbaclkToNcclTest - displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterFallbaclkToNcclTest -# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x 
MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml new file mode 100644 index 00000000..211e2393 --- /dev/null +++ b/.azure-pipelines/templates/nccl-test.yml @@ -0,0 +1,76 @@ +# .azure-pipelines/templates/nccl-test.yml +# ---------------------------------------- +# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. 
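Every fallback test in the new template below follows one pattern: `libmscclpp_nccl.so` is `LD_PRELOAD`ed over the benchmark's NCCL, and a chosen collective is forced back onto the real NCCL build named by `MSCCLPP_NCCL_LIB_PATH`, so a single binary exercises both code paths. Stripped of the mpirun plumbing, the environment contract looks like this (a sketch of the pattern as used in this template, not an exhaustive list of knobs):

```bash
# Interpose MSCCLPP's NCCL-compatible shim over the benchmark...
export LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so
# ...and allow it to delegate selected collectives to a real NCCL.
export MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE
export MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so
# Force one operation (here allgather) through the fallback path.
export MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather"
/root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
```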
+# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: nvccGencode + type: string + default: "-gencode=arch=compute_80,code=sm_80" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + deployArgs: 'nccltest-single-node' + +- template: run-remote-task.yml + parameters: + name: InstallNcclTests + displayName: Install NCCL Tests + remoteScript: | + cd + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: InstallNccl + displayName: Install NCCL + remoteScript: | + LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4) + if [ -z "$LATEST_TAG" ]; then + echo "Failed to fetch latest NCCL tag" + exit 1 + fi + cd + git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }} + +- template: run-remote-task.yml + parameters: + name: RunNcclAllGatherFallbackToNcclTest + displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclAllReduceFallbackToNcclTest + displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclBroadcastFallbackToNcclTest + displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x 
MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml new file mode 100644 index 00000000..8e247161 --- /dev/null +++ b/.azure-pipelines/templates/rccl-test.yml @@ -0,0 +1,63 @@ +# .azure-pipelines/templates/rccl-test.yml +# ------------------------------------------------ +# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop +# vmssName – VMSS name to start/stop +# gpuArch – GPU architecture (e.g. gfx942) + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + default: "gfx942" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: rocm + gpuArch: ${{ parameters.gpuArch }} + buildTests: false + deployArgs: 'single-node-test true rocm' + + +- template: run-remote-task.yml + parameters: + name: InstallRcclTests + displayName: Install RCCL Tests + remoteScript: | + cd + git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git + cd rocm-systems + git sparse-checkout init --cone + git sparse-checkout set projects/rccl-tests + git checkout + cd projects/rccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: RunRcclAllGatherTest + displayName: Run RCCL AllGather Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunRcclAllReduceTest + displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml new file mode 100644 index 00000000..37b3a7d7 --- /dev/null +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -0,0 +1,27 @@ +parameters: +- name: name + 
type: string + default: '' +- name: displayName + type: string +- name: runRemoteArgs + type: string + default: '' +- name: remoteScript + type: string +- name: workingDirectory + type: string + default: '$(System.DefaultWorkingDirectory)' + +steps: +- task: Bash@3 + ${{ if ne(parameters.name, '') }}: + name: ${{ parameters.name }} + displayName: ${{ parameters.displayName }} + inputs: + targetType: 'inline' + script: | + test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD' + ${{ parameters.remoteScript }} + REMOTE_CMD + workingDirectory: ${{ parameters.workingDirectory }} diff --git a/.azure-pipelines/templates/stop.yml b/.azure-pipelines/templates/stop.yml new file mode 100644 index 00000000..40498c29 --- /dev/null +++ b/.azure-pipelines/templates/stop.yml @@ -0,0 +1,20 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp + +steps: +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} diff --git a/.azure-pipelines/templates/ut-executor.yml b/.azure-pipelines/templates/ut-executor.yml new file mode 100644 index 00000000..426daf17 --- /dev/null +++ b/.azure-pipelines/templates/ut-executor.yml @@ -0,0 +1,42 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: ExecutorTest + displayName: Run executor tests + remoteScript: | + python3 -m pip install . 
+ PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans + TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place + mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml deleted file mode 100644 index aa21c407..00000000 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ /dev/null @@ -1,89 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml new file mode 100644 index 00000000..a62f1a77 --- /dev/null +++ b/.azure-pipelines/templates/ut-no-ib-env.yml @@ -0,0 +1,95 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_USE_IB=OFF' + deployArgs: 'single-node-test false' + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: run-remote-task.yml + parameters: + name: StopContainer + displayName: Stop existing container + runRemoteArgs: '--no-docker --no-log' + remoteScript: | + sudo docker stop mscclpp-test || true + sudo docker rm mscclpp-test || true + +- task: Bash@3 + displayName: Remove generated SSH key files + inputs: + targetType: 'inline' + script: | + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + -DMSCCLPP_BUILD_TESTS=ON \ + -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- template: run-remote-task.yml + parameters: + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml deleted file mode 100644 index 0ab733c9..00000000 --- a/.azure-pipelines/templates/ut-npkit.yaml +++ /dev/null @@ -1,145 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - - -steps: -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - mkdir -p build && cd build; \ - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ - make -j"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: 'inline' - script: | - # set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: 
bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml new file mode 100644 index 00000000..1bd89caf --- /dev/null +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -0,0 +1,57 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"' + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --filter="ExecutorTest.TwoNodesAllreduce" + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY 
./npkit_output/npkit_event_trace.json + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml deleted file mode 100644 index 093a6094..00000000 --- a/.azure-pipelines/templates/ut.yaml +++ /dev/null @@ -1,135 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml new file mode 100644 index 00000000..743c66e6 --- /dev/null +++ b/.azure-pipelines/templates/ut.yml @@ -0,0 +1,49 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 960f3eae..6b8c9eda 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -37,17 +37,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 
cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestWithNpKitA100 @@ -59,17 +58,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestH100 @@ -79,17 +77,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestWithNpKitH100 @@ -99,21 +96,20 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestNoIBEnv - timeoutInMinutes: 40 + timeoutInMinutes: 60 displayName: Test No IB Environment pool: name: msccl-ci-h100 @@ -121,15 +117,55 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-no-ib-env.yaml + - template: templates/ut-no-ib-env.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' + +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 + +- job: UnitTestExecutor + timeoutInMinutes: 60 + displayName: Test DSL Executor + pool: + name: msccl-ci-h100 + + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/ut-executor.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' \ No newline at end of file diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..a98f1e89 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,24 @@ +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + target: 68% + threshold: 1% + patch: + default: + target: 80% + +flag_management: + default_rules: + carryforward: true + +ignore: + - "test/" + - "examples/" + - "python/" + - "tools/" + - "docs/" + - "docker/" diff 
--git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4cf9dbf8..9d7e7798 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -25,7 +25,7 @@ For C/C++/CUDA source code: ``` ## Formatting -If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files. +If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first. ## Building and Testing The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions. @@ -40,10 +40,10 @@ cd .. For testing after successful build: ```bash -# To run all tests +# To run tests with two GPUs - two is enough for most tests mpirun -np 2 ./build/bin/mp_unit_tests # To run tests excluding IB-related ones (when IB is not available) -mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib* +mpirun -np 2 ./build/bin/mp_unit_tests --filter=-*Ib* ``` For building a Python package: @@ -51,6 +51,12 @@ For building a Python package: python3 -m pip install -e . ``` +For Python tests after building the package: +```bash +# Run tests with 8 GPUs - adjust the number as needed +mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx +``` + For building documentation (see dependencies in `docs/requirements.txt`): ```bash cd docs diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b423e326..fb065141 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - name: Checkout repository @@ -51,7 +51,7 @@ jobs: df -h - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} @@ -62,11 +62,11 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. - make -j + cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF .. + make -j4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}/version:${{matrix.version}}" @@ -96,7 +96,7 @@ jobs: df -h - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} @@ -107,10 +107,10 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. - make -j + CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF .. 
+ make -j4 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}/version:${{matrix.version}}" diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yml similarity index 100% rename from .github/workflows/doc-build.yaml rename to .github/workflows/doc-build.yml diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml deleted file mode 100644 index 900e8aba..00000000 --- a/.github/workflows/integration-test-backup.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: IntegrationTest - -on: workflow_dispatch - -jobs: - IntegrationTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - - name: Lock GPU clock frequency - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Run mscclpp AllGather test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - - - name: Run mscclpp SendRecv test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - - - name: Run mscclpp AllReduce test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - - - name: Run mscclpp AllToAll test - run: | - set -e - mpirun --allow-run-as-root -np 8 
--bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 5947b087..a9187e96 100644 --- a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml deleted file mode 100644 index 8849c353..00000000 --- a/.github/workflows/ut-backup.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: UnitTest - -on: workflow_dispatch - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - working-directory: ${{ github.workspace }} - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: UnitTests - run: | - ./build/bin/unit_tests - - - name: MpUnitTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests - - - name: PyTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x diff --git a/.gitignore b/.gitignore index 9c4da143..74307e67 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,9 @@ .vscode/ -.hypothesis/ build/ -dist/ +build_coverage/ __pycache__ .*.swp -.idea/ *.so +.pytest_cache/ +_codeql_detected_source_root docs/_static/versions.js -_codeql_detected_source_root \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6288dbb0..ef8b785a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. cmake_minimum_required(VERSION 3.25) project(mscclpp LANGUAGES CXX) @@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) # Options option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF) -option(MSCCLPP_BUILD_TESTS "Build tests" ON) +option(MSCCLPP_BUILD_TESTS "Build tests" OFF) option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON) option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON) option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON) @@ -56,6 +56,8 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." 
OFF)
 option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
+option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
+option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
 set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
 if(MSCCLPP_BYPASS_GPU_CHECK)
@@ -98,6 +100,62 @@ else()
     message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.")
   endif()
 endif()
+
+# Code coverage setup
+if(MSCCLPP_ENABLE_COVERAGE)
+  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading")
+  endif()
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    message(STATUS "Code coverage enabled")
+
+    # Add coverage flags to C++ targets only (not CUDA)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:--coverage>)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O0>)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-g>)
+    add_link_options($<$<LINK_LANGUAGE:CXX>:--coverage>)
+
+    # Find lcov
+    find_program(LCOV_PATH lcov)
+
+    if(NOT LCOV_PATH)
+      message(WARNING "lcov not found. Install lcov to generate coverage reports.")
+    endif()
+
+    if(LCOV_PATH)
+      # Add coverage target
+      add_custom_target(coverage
+        COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..."
+        COMMAND ${LCOV_PATH} --directory . --zerocounters
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Running tests..."
+        COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..."
+        COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..."
+        COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info
+
+        COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info"
+
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMENT "Generating code coverage report"
+      )
+
+      # Add coverage clean target
+      add_custom_target(coverage-clean
+        COMMAND ${CMAKE_COMMAND} -E remove coverage.info
+        COMMAND ${LCOV_PATH} --directory . --zerocounters
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMENT "Cleaning coverage data"
+      )
+    endif()
+  else()
+    message(WARNING "Code coverage is only supported with GCC or Clang compilers")
+  endif()
+endif()
 if(MSCCLPP_GPU_ARCHS)
   string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS)
   string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}")
@@ -166,12 +224,35 @@ if(MSCCLPP_USE_IB)
   if(NOT IBVERBS_FOUND)
     message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
   endif()
+  find_package(MLX5)
+  if(MLX5_FOUND)
+    message(STATUS "MLX5 Direct Verbs found: ${MLX5_LIBRARIES}")
+  else()
+    message(STATUS "MLX5 Direct Verbs not found, mlx5dv optimizations disabled")
+  endif()
 endif()
 
 find_package(NUMA REQUIRED)
 find_package(Threads REQUIRED)
 
+option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host."
ON) +if(MSCCLPP_USE_ROCM) + set(MSCCLPP_USE_GDRCOPY OFF) +endif() +if(MSCCLPP_USE_GDRCOPY) + find_package(GDRCopy) + if(NOT GDRCOPY_FOUND) + message(STATUS "GDRCopy not found, disabling GDRCopy support") + set(MSCCLPP_USE_GDRCOPY OFF) + else() + message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}") + endif() +endif() + include(FetchContent) -FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) +FetchContent_Declare(json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG v3.12.0 +) FetchContent_MakeAvailable(json) if("${INSTALL_PREFIX}" STREQUAL "") diff --git a/README.md b/README.md index 69ae5add..58586a30 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,16 @@ [![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest) [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE) [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml) -[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp) | Testing Pipelines | Build Status | |--------------------------|-------------------| -| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | -| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) | +| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | +| NCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) | +| RCCL Tests | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) | A GPU-driven communication stack for scalable AI applications. diff --git a/VERSION b/VERSION index a3df0a69..ac39a106 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.0 +0.9.0 diff --git a/cmake/FindGDRCopy.cmake b/cmake/FindGDRCopy.cmake new file mode 100644 index 00000000..54e0ba1c --- /dev/null +++ b/cmake/FindGDRCopy.cmake @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Find the GDRCopy libraries (>= 2.5 required for gdr_pin_buffer_v2 / GDR_PIN_FLAG_FORCE_PCIE) +# +# The following variables are optionally searched for defaults +# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found +# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found +# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found + +# The following are set after configuration is done: +# GDRCOPY_FOUND +# GDRCOPY_INCLUDE_DIRS +# GDRCOPY_LIBRARIES + +find_path(GDRCOPY_INCLUDE_DIRS + NAMES gdrapi.h + HINTS + ${GDRCOPY_INCLUDE_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/include + /usr/local/include + /usr/include) + +find_library(GDRCOPY_LIBRARIES + NAMES gdrapi + HINTS + ${GDRCOPY_LIB_DIR} + ${GDRCOPY_ROOT_DIR} + ${GDRCOPY_ROOT_DIR}/lib + /usr/local/lib + /usr/lib + /usr/lib/x86_64-linux-gnu) + +if(GDRCOPY_INCLUDE_DIRS) + include(CheckSymbolExists) + set(CMAKE_REQUIRED_INCLUDES ${GDRCOPY_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${GDRCOPY_LIBRARIES}) + check_symbol_exists(gdr_pin_buffer_v2 "gdrapi.h" GDRCOPY_HAS_PIN_BUFFER_V2) + unset(CMAKE_REQUIRED_LIBRARIES) + unset(CMAKE_REQUIRED_INCLUDES) + if(NOT GDRCOPY_HAS_PIN_BUFFER_V2) + message(STATUS "GDRCopy found but too old (gdr_pin_buffer_v2 not available). Requires >= 2.5.") + set(GDRCOPY_INCLUDE_DIRS GDRCOPY_INCLUDE_DIRS-NOTFOUND) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) +mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES) diff --git a/cmake/FindMLX5.cmake b/cmake/FindMLX5.cmake new file mode 100644 index 00000000..9fd59127 --- /dev/null +++ b/cmake/FindMLX5.cmake @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +# Find the MLX5 Direct Verbs (mlx5dv) library +# +# The following variables are optionally searched for defaults +# MLX5_ROOT_DIR: Base directory where all MLX5 components are found +# MLX5_INCLUDE_DIR: Directory where MLX5 headers are found +# MLX5_LIB_DIR: Directory where MLX5 libraries are found + +# The following are set after configuration is done: +# MLX5_FOUND +# MLX5_INCLUDE_DIRS +# MLX5_LIBRARIES + +find_path(MLX5_INCLUDE_DIRS + NAMES infiniband/mlx5dv.h + HINTS + ${MLX5_INCLUDE_DIR} + ${MLX5_ROOT_DIR} + ${MLX5_ROOT_DIR}/include + /usr/local/include + /usr/include) + +find_library(MLX5_LIBRARIES + NAMES mlx5 + HINTS + ${MLX5_LIB_DIR} + ${MLX5_ROOT_DIR} + ${MLX5_ROOT_DIR}/lib + /usr/local/lib + /usr/lib + /usr/lib/x86_64-linux-gnu) + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(MLX5 DEFAULT_MSG MLX5_INCLUDE_DIRS MLX5_LIBRARIES) +mark_as_advanced(MLX5_INCLUDE_DIRS MLX5_LIBRARIES) diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 04ba1f03..47436202 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp RUN apt-get update && \ apt-get install -y --no-install-recommends \ htop \ - lcov \ vim \ && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* +# Install lcov 2.2 +RUN LCOV_VERSION="2.2" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + cpanminus \ + gcc \ + make \ + perl \ + && \ + cpanm --notest \ + Capture::Tiny \ + DateTime \ + JSON::XS \ + Memory::Process \ + TimeDate \ + && \ + cd /tmp && \ + curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \ + tar xzf lcov.tar.gz && \ + cd lcov-${LCOV_VERSION} && \ + make install && \ + cd / && rm -rf /tmp/lcov* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + # Install CMake 3.26.4 RUN OS_ARCH=$(uname -m) && \ CMAKE_VERSION="3.26.4" && \ @@ -24,6 +49,33 @@ RUN OS_ARCH=$(uname -m) && \ rm -rf ${CMAKE_HOME}.tar.gz && \ ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/ +# Install GDRCopy userspace library for CUDA targets +ARG TARGET="cuda13.0" +RUN if echo "$TARGET" | grep -q "^cuda"; then \ + GDRCOPY_VERSION="2.5.2" && \ + apt-get update -y && \ + apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \ + cd /tmp && \ + curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \ + tar xzf gdrcopy.tar.gz && \ + cd gdrcopy-${GDRCOPY_VERSION}/packages && \ + ./build-deb-packages.sh -k -t && \ + dpkg -i libgdrapi_*.deb && \ + cd / && rm -rf /tmp/gdrcopy* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + +# Install ROCm-specific packages if building for ROCm +RUN if echo "$TARGET" | grep -q "^rocm"; then \ + apt-get update -y && \ + apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + # Create Python venv RUN python3 -m venv /root/venv && \ echo 'source /root/venv/bin/activate' >> /root/.bashrc @@ -32,10 +84,13 @@ ENV PATH="/root/venv/bin:${PATH}" # Install Python dependencies ADD . 
/tmp/mscclpp WORKDIR /tmp/mscclpp -ARG TARGET="cuda13.0" RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + if echo "$TARGET" | grep -q "^rocm"; then \ + export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ + fi && \ pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r python/requirements_${target_type}.txt + pip install --no-cache-dir -r python/requirements_${target_type}.txt && \ + pip install --no-cache-dir coverage xlsxwriter # Cleanup RUN rm -rf /tmp/mscclpp diff --git a/docker/base-x-rocm.dockerfile b/docker/base-x-rocm.dockerfile deleted file mode 100644 index 525ba1d4..00000000 --- a/docker/base-x-rocm.dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -ENV RCCL_VERSION=rocm-6.2.0 -ARG GPU_ARCH=gfx942 -ENV ARCH_TARGET=${GPU_ARCH} -RUN cd /tmp && \ - git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \ - cd rccl && \ - ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \ - cd .. && \ - rm -rf /tmp/rccl - -WORKDIR / diff --git a/docker/build.sh b/docker/build.sh index e9b10c3a..89568e19 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,38 +4,39 @@ set -e declare -A baseImageTable baseImageTable=( - ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" - ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" - ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" - ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" - ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" + ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" - ["rocm6.2"]="rocm/rocm-terminal:6.2.1" + ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable extraLdPathTable=( - ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" - ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" - ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" + ["cuda11.8"]="/usr/local/cuda-11.8/compat" + ["cuda12.4"]="/usr/local/cuda-12.4/compat" + ["cuda12.8"]="/usr/local/cuda-12.8/compat" + ["cuda12.9"]="/usr/local/cuda-12.9/compat" + ["cuda13.0"]="/usr/local/cuda-13.0/compat" ["rocm6.2"]="/opt/rocm/lib" ) declare -A ofedVersionTable ofedVersionTable=( + ["cuda11.8"]="23.07-0.5.1.2" ["cuda12.4"]="23.07-0.5.1.2" ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" ["cuda13.0"]="24.10-3.2.5.0" + ["rocm6.2"]="24.10-1.1.4.0" ) TARGET=${1} OS_ARCH=$(uname -m) print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then @@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \ if [[ ${TARGET} == rocm* ]]; then echo "Building ROCm base image..." - docker build -t ${TAG_BASE} \ - -f docker/base-x-rocm.dockerfile \ - --build-arg BASE_IMAGE=${TAG_TMP} \ - --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ - --build-arg TARGET=${TARGET} \ - --build-arg GPU_ARCH="gfx942" . - docker rmi ${TAG_TMP} else echo "Building CUDA base image..." 
- docker tag ${TAG_TMP} ${TAG_BASE} - docker rmi --no-prune ${TAG_TMP} fi +docker tag ${TAG_TMP} ${TAG_BASE} +docker rmi --no-prune ${TAG_TMP} docker build -t ${TAG_BASE_DEV} \ -f docker/base-dev-x.dockerfile \ diff --git a/docs/Makefile b/docs/Makefile index 5bc7422e..bf82c03a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,7 +5,7 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build -SPHINXMULTIVERSION ?= sphinx-multiversion +SPHINXMULTIVERSION ?= python3 build_multiversion.py SOURCEDIR = . BUILDDIR = _build diff --git a/docs/_static/version-selector.js b/docs/_static/version-selector.js index 0efc47fe..7622aefd 100644 --- a/docs/_static/version-selector.js +++ b/docs/_static/version-selector.js @@ -26,27 +26,53 @@ * @returns {string} The base path (e.g., '/mscclpp' or '') */ function detectBasePath() { - const path = window.location.pathname; - // Match pattern: /base-path/vX.Y.Z/... or /base-path/main/... - // The base path is everything before the version or main directory - const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/); - if (match && match[1]) { - return match[1]; - } - // Check if we're at a root that's actually a project site - // Look for common indicators like the repository name in the path - const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/); - if (projectMatch) { - // Verify this isn't a version path at root - const potentialBase = projectMatch[1]; - if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') { - // Check if the remaining path contains version info - const remainingPath = path.substring(potentialBase.length); - if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) { - return potentialBase; + // Most reliable method: detect from this script's own URL + // The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js + const scripts = document.getElementsByTagName('script'); + for (let i = 0; i < scripts.length; i++) { + const src = scripts[i].src; + if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) { + try { + const url = new URL(src); + const scriptPath = url.pathname; + // Extract base path: everything before /_static/version-selector.js + // But also strip version directories like /v0.8.0/ or /main/ + const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/); + if (match) { + let basePath = match[1] || ''; + // Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp) + basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, ''); + return basePath; + } + } catch (e) { + // URL parsing failed, continue to fallback + // Log a warning to aid debugging when the primary detection method fails. + if (typeof console !== 'undefined' && typeof console.warn === 'function') { + console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e); + } } } } + + // Fallback: try to detect from URL path + const path = window.location.pathname; + const segments = path.split('/').filter(s => s.length > 0); + + if (segments.length >= 1) { + const firstSegment = segments[0]; + // If first segment is not a version tag (vX.Y.Z), not 'main', and + // does not look like a file name (no '.' in the segment), then it's + // the GitHub Pages project base path (e.g., 'mscclpp'). 
+ // This handles both: + // /mscclpp/v0.8.0/index.html -> base is /mscclpp + // /mscclpp/index.html -> base is /mscclpp + // while avoiding treating root files like /index.html as a base path. + if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) { + return '/' + firstSegment; + } + } + + // No base path (root site or local development) return ''; } diff --git a/docs/build_multiversion.py b/docs/build_multiversion.py new file mode 100644 index 00000000..ace20fc0 --- /dev/null +++ b/docs/build_multiversion.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Wrapper around sphinx-multiversion that patches copy_tree to generate +_version.py in each tag checkout. This is needed because setuptools_scm +generates _version.py at build time, but sphinx-multiversion uses +`git archive` which only contains committed files. + +Usage (called by Makefile): + python3 build_multiversion.py [sphinx-opts...] +""" + +import os +import re +import subprocess +import sys + +import sphinx_multiversion.git as smv_git +from sphinx_multiversion import main as smv_main + +# Save the original copy_tree +_original_copy_tree = smv_git.copy_tree + + +def _patched_copy_tree(gitroot, src, dst, reference, sourcepath="."): + """Call original copy_tree, then generate _version.py from the VERSION file.""" + _original_copy_tree(gitroot, src, dst, reference, sourcepath) + + # Extract version from the tag name (e.g., "v0.9.0" -> "0.9.0") + refname = getattr(reference, "refname", "") or "" + match = re.search(r"v(\d+\.\d+\.\d+)", refname) + if not match: + return + + version = match.group(1) + version_py_dir = os.path.join(dst, "python", "mscclpp") + if os.path.isdir(version_py_dir): + version_py = os.path.join(version_py_dir, "_version.py") + if not os.path.exists(version_py): + with open(version_py, "w") as f: + f.write(f'__version__ = "{version}"\n') + + +# Monkey-patch +smv_git.copy_tree = _patched_copy_tree + +if __name__ == "__main__": + sys.exit(smv_main(sys.argv[1:])) diff --git a/docs/conf.py b/docs/conf.py index fdfb8d66..52321465 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,18 @@ import sys import importlib.util from pathlib import Path +from unittest.mock import MagicMock + + +class NamedMock(MagicMock): + def __getattr__(self, name): + attr = super().__getattr__(name) + if isinstance(attr, MagicMock): + # Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection. + attr.__name__ = name + attr.__qualname__ = name + return attr + # Add the python package to sys.path so Sphinx can find it project_root = Path(__file__).parent.parent @@ -63,7 +75,7 @@ autodoc_default_options = { "show-inheritance": True, } # only mock the C-extension when using the source tree -autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] +autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] autodoc_typehints = "description" napoleon_google_docstring = True napoleon_numpy_docstring = True @@ -71,6 +83,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable/", None), } +mock_mscclpp = NamedMock() +# Set attributes to satisfy Sphinx autodoc inspection. 
+mock_mscclpp.env.return_value.cache_dir = "_mscclpp" +sys.modules["mscclpp._mscclpp"] = mock_mscclpp templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/docs/dsl/quick_start.md b/docs/dsl/quick_start.md index 6c32ec32..afccd48e 100644 --- a/docs/dsl/quick_start.md +++ b/docs/dsl/quick_start.md @@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol python3 -m mscclpp --install ``` +This installs bundled default execution plans into `~/.cache/mscclpp/default` by default. +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so its value should not include `default`. + ## Your First Algorithm: AllGather Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features. diff --git a/docs/dsl/results.md b/docs/dsl/results.md index a34eae5b..a1adad2a 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -56,9 +56,12 @@ python3 -m mscclpp --install After installation, the generated JSON execution plan can be found at: ``` -~/.cache/mscclpp_default/ +~/.cache/mscclpp/default/ ``` +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so its value should not include `default`. + **Performance Results:** The figure below shows the performance characteristics for small message sizes in a two-node configuration: diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 236dd8ef..b4e4fcdf 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -129,7 +129,7 @@ class CustomizedComm: self._algo_large = [ algo for algo in algorithms if algo.collective == "allreduce" - and algo.name == "default_allreduce_nvls_with_copy" + and algo.name == "default_allreduce_nvls_warp_pipeline" ][0] def all_reduce(self, tensor: torch.Tensor, stream=None): @@ -332,7 +332,8 @@ public: size_t inputSize, size_t outputSize, mscclpp::DataType dtype, mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->kernelFunc(ctx, input, output, inputSize, dtype, stream); }, // Context initialization function @@ -343,8 +344,8 @@ public: }, // Context key generation function [self](const void* input, void* output, - size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateContextKey(input, output, inputSize, outputSize, dtype); + size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); } ); } @@ -468,3 +469,196 @@ stream_handle = torch.cuda.current_stream().cuda_stream All examples are in [`examples/torch-integration/`](../../examples/torch-integration/). +--- + +## Performance Tuning + +By default, algorithms are selected by a fixed heuristic based on message size. For production workloads, you can achieve significantly better performance by **auto-tuning** — benchmarking every candidate algorithm, block count, and thread count for each message size at startup, then using the fastest configuration at runtime.
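+
+Tuned results are keyed by power-of-two size buckets. As a minimal illustration of the bucketing arithmetic used throughout this section (the full example additionally clamps sizes to a supported range), a message size maps to its bucket like this:
+
+```python
+def size_bucket(nbytes: int) -> int:
+    # Round up to the next power of two, e.g. 5 MB -> 8 MB.
+    return 1 << (nbytes - 1).bit_length()
+
+assert size_bucket(5 * 1024 * 1024) == 8 * 1024 * 1024
+```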
+ +**Full example:** [customized_comm_with_tuning.py](../../examples/torch-integration/customized_comm_with_tuning.py) + +### How It Works + +1. **Candidate selection** — For each power-of-two message size from 1 KB to 128 MB, the tuner picks the applicable algorithms: + - All sizes (when NVLS is supported): `default_allreduce_nvls_zero_copy` + - Small messages (≤ 4 MB): `default_allreduce_nvls_packet`, `default_allreduce_packet` + - Large messages (≥ 512 KB): `default_allreduce_rsag_zero_copy` + +2. **Grid search** — Each candidate is run with every combination of block counts (`4, 8, 16, … 128`) and thread counts (`512, 768, 1024`). Results are captured in a CUDA graph and timed. + +3. **Cross-rank consensus** — Elapsed times are averaged across all ranks with an allreduce so every GPU selects the same configuration. + +4. **Runtime dispatch** — `get_tuned_config()` rounds the actual message size up to the next power of two and returns the winning `(algorithm, nblocks, nthreads)` triple. + +### Symmetric Memory Allocation + +Algorithms like `default_allreduce_nvls_zero_copy` require **symmetric memory** — memory where the buffer offset is the same for each rank, allocated via `mscclpp.RawGpuBuffer` (`cuMemAlloc`). Regular `torch.rand()` or `torch.empty()` allocations cannot be used with these algorithms because they do not guarantee the same offset across ranks. Instead, allocate a single large buffer and reuse it for all message sizes: + +```python +# Allocate symmetric memory via RawGpuBuffer and wrap as a PyTorch tensor +tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16)) +tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor) +tune_tensor.normal_() +``` + +When executing an algorithm with symmetric memory, pass `symmetric_memory=True`: + +```python +def _run_algo(self, algo, tensor, size, nblocks, nthreads): + return algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=size, + output_size=size, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=mscclpp.ReduceOp.SUM, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nblocks, + nthreads_per_block=nthreads, + symmetric_memory=True, + ) +``` + +### Loading Candidate Algorithms + +The same `load_algorithms` helper from Approach 1 is reused. The tuner extracts multiple algorithm objects: + +```python +algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank) + +self._algorithm_nvls_packet = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_packet" +][0] + +self._algorithm_rsag_zero_copy = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_rsag_zero_copy" +][0] + +self._algorithm_packet = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_packet" +][0] + +# NVLS zero-copy is only available on supported hardware +if mscclpp.is_nvls_supported(): + self._algorithm_nvls_zero_copy = [ + algo for algo in algorithms + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_zero_copy" + ][0] +``` + +### The Tuning Loop + +The tuning loop iterates over message sizes, candidate algorithms, and kernel launch parameters. CUDA graphs are used for accurate timing. 
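+
+The graph-capture details are elided in the tuning loop listed next; schematically, timing one candidate configuration works as in the sketch below, where `run_once()` is a stand-in for a single algorithm invocation (a simplified rendition of the logic in the full example):
+
+```python
+# Capture n_ops_per_graph invocations into one CUDA graph, then replay and time it.
+capture_stream = torch.cuda.Stream()
+capture_stream.wait_stream(torch.cuda.current_stream())
+graph = torch.cuda.CUDAGraph()
+with torch.cuda.graph(graph, stream=capture_stream):
+    for _ in range(n_ops_per_graph):
+        run_once()
+
+start = torch.cuda.Event(enable_timing=True)
+end = torch.cuda.Event(enable_timing=True)
+start.record(capture_stream)
+with torch.cuda.stream(capture_stream):
+    for _ in range(n_graph_launches):
+        graph.replay()
+end.record(capture_stream)
+end.synchronize()
+elapsed_ms = start.elapsed_time(end)
+```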
Note the use of `RawGpuBuffer` for symmetric memory: + +```python +def _tune(self, n_warmup, n_graph_launches, n_ops_per_graph): + sizes = [1 << i for i in range(10, 28)] + self.best_configs = {1024: (self._algorithm_nvls_packet, 0, 0)} + + # Use RawGpuBuffer for symmetric memory allocation + tune_tensor = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16)) + tune_tensor = torch.utils.dlpack.from_dlpack(tune_tensor) + tune_tensor.normal_() + candidates_nblocks = [4, 8, 16, 24, 32, 48, 64, 128] + candidates_nthreads = [512, 768, 1024] + + for size in sizes: + algos = [] + if mscclpp.is_nvls_supported(): + algos.append(self._algorithm_nvls_zero_copy) + if size <= 4 * 1024 * 1024: + algos.append(self._algorithm_nvls_packet) + algos.append(self._algorithm_packet) + if size >= 512 * 1024: + algos.append(self._algorithm_rsag_zero_copy) + + best_time = float("inf") + best_config = None + + for algo in algos: + for nb in candidates_nblocks: + for nt in candidates_nthreads: + if self._run_algo(algo, tune_tensor, size, nb, nt) != 0: + continue # skip unsupported configs + + # Warmup, then time with CUDA graphs + # ... (see full example for graph capture logic) + + # Average timing across ranks + time_tensor = torch.full( + (self.world_size,), elapsed, dtype=torch.float64, device="cuda" + ).to(dtype=torch.float32) + self.all_reduce(time_tensor, op=torch.distributed.ReduceOp.SUM) + avg_time = time_tensor[self.rank].item() / self.world_size + + if avg_time < best_time: + best_time = avg_time + best_config = (algo, nb, nt) + + if best_config: + self.best_configs[size] = best_config +``` + +### Dispatching with Tuned Configuration + +At runtime, round the message size to the next power of two and look up the best configuration. When the tensor is allocated from `RawGpuBuffer` (`cuMemAlloc`) and the buffer offset is the same for each rank, pass `symmetric_memory=True` to the `execute()` call (see the [Symmetric Memory Allocation](#symmetric-memory-allocation) section above): + +```python +def get_tuned_config(self, size): + if size < 1024: + target_size = 1024 + elif size > 256 * 1024 * 1024: + target_size = 256 * 1024 * 1024 + else: + target_size = 1 << (size - 1).bit_length() + return self.best_configs.get(target_size) + +def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None): + config = self.get_tuned_config(tensor.nbytes) + algo, nblocks, nthreads = config if config else (self._algorithm_nvls_packet, 0, 0) + algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=tensor.nbytes, + output_size=tensor.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=mscclpp.ReduceOp.SUM, + stream=stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream, + nblocks=nblocks, + nthreads_per_block=nthreads, + ) +``` + +### Benchmarking with Symmetric Memory + +When benchmarking tuned configurations, use the same `RawGpuBuffer` allocation pattern. 
Create one large buffer and slice it for each message size: + +```python +def benchmark(self, n_warmup=10, n_graph_launches=10, n_iter_per_graph=100): + # Allocate a single large RawGpuBuffer (symmetric memory) and reuse for all sizes + dtype = torch.float16 + bench_buf = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(dtype)) + bench_buf = torch.utils.dlpack.from_dlpack(bench_buf) + bench_buf.normal_() + + for size in sizes: + n_elements = size // bench_buf.element_size() + tensor = bench_buf[:n_elements] + + # Capture CUDA graph, warmup, and time... + with torch.cuda.graph(g, stream=capture_stream): + for _ in range(n_iter_per_graph): + self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) +``` + +### Running the Tuning Example + +```bash +MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> \ + torchrun --nnodes=1 --nproc_per_node=8 customized_comm_with_tuning.py +``` diff --git a/docs/py_api.rst b/docs/py_api.rst index 5ea39bc3..7acc9273 100644 --- a/docs/py_api.rst +++ b/docs/py_api.rst @@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API. :toctree: py_api :recursive: - mscclpp.comm - mscclpp.utils - mscclpp.language + mscclpp diff --git a/docs/quickstart.md b/docs/quickstart.md index 04a26466..c9c98128 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -31,6 +31,9 @@ ``` If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)). * (Optional, for benchmarks) MPI + * (Optional, for NVIDIA platforms) [GDRCopy](https://github.com/NVIDIA/gdrcopy) >= 2.5.1 + * GDRCopy is required for IB `HostNoAtomic` mode, which uses CPU-side signal forwarding to GPU memory via BAR1 mappings. This mode is used on platforms where RDMA atomics are not available (e.g., when using Data Direct Virtual Functions). + * Install GDRCopy from source or via packages. See the [GDRCopy installation guide](https://github.com/NVIDIA/gdrcopy#installation). * Others * For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites). * For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above. @@ -42,7 +45,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se ```bash # For NVIDIA platforms -$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash +$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash # For AMD platforms $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash ``` @@ -171,7 +174,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory.
```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` @@ -189,14 +191,12 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec Example 1, Allreduce will fall back to NCCL ncclAllReduce since allreduce is in the fallback list. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` Example 2, ReduceScatter will still use the MSCCL++ implementation since reducescatter is not in the fallback list. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; -mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 100 +mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features. diff --git a/docs/tutorials/03-memory-channel.md b/docs/tutorials/03-memory-channel.md index 00e2192b..c6a8b9e1 100644 --- a/docs/tutorials/03-memory-channel.md +++ b/docs/tutorials/03-memory-channel.md @@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes); mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport); ``` -Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials. +Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with.
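+For instance, multiple transports can be combined with `|` when registering memory (a brief illustrative sketch, not part of this example; `IB0` denotes the first InfiniBand port):
+
+```cpp
+// Allow both CUDA/HIP IPC access and RDMA access over the first IB port.
+auto transports = mscclpp::Transport::CudaIpc | mscclpp::Transport::IB0;
+mscclpp::RegisteredMemory regMem = comm.registerMemory(buffer.data(), buffer.bytes(), transports);
+```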
In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials. **GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope. @@ -251,6 +251,37 @@ columns 2 Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed. +(mc-cross-node)= +## Cross-node Execution + +For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms. + +We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL. + +Run the program on two nodes with command line arguments: + +``` +./bidir_memory_channel <ip:port> <rank> <gpu_id> +``` + +For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally. + +On Node 0 (Rank 0): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 0 0 +``` + +On Node 1 (Rank 1): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 1 0 +``` + +You should see output indicating successful data transfer. + +```{tip} +If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`). +``` + ## Summary and Next Steps In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`.
diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index 436a6a94..02df3685 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -101,15 +101,17 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); }, [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream); }, [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { return self->generateAllgatherContextKey(input, output, inputSize, outputSize, - static_cast<ncclDataType_t>(dtype)); + static_cast<ncclDataType_t>(dtype), symmetricMemory); }); return allgatherAlgo; } @@ -191,7 +193,7 @@ } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, ncclDataType_t dtype) { + size_t outputSize, ncclDataType_t dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index 10400ddc..907b3ada 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -69,14 +69,16 @@ "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); }, [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream); }, [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype); + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); }); return allgatherAlgo; } @@ -159,7 +161,7 @@ } mscclpp::AlgorithmCtxKey
generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, mscclpp::DataType dtype) { + size_t outputSize, mscclpp::DataType dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_comm_with_default_algo.py b/examples/torch-integration/customized_comm_with_default_algo.py index 78560f15..3e933107 100644 --- a/examples/torch-integration/customized_comm_with_default_algo.py +++ b/examples/torch-integration/customized_comm_with_default_algo.py @@ -15,7 +15,9 @@ import ipaddress def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection: collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() return collection_builder.build_default_algorithms( - scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank + scratch_buffer=scratch_buffer.data_ptr(), + scratch_buffer_size=scratch_buffer.nbytes, + rank=rank, ) @@ -59,7 +61,7 @@ class CustomizedComm: self._algorithm_nvls_nonzero_copy = [ algo for algo in algorithms - if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_with_copy" + if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_warp_pipeline" ][0] def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None): diff --git a/examples/torch-integration/customized_comm_with_tuning.py b/examples/torch-integration/customized_comm_with_tuning.py new file mode 100644 index 00000000..060a0097 --- /dev/null +++ b/examples/torch-integration/customized_comm_with_tuning.py @@ -0,0 +1,476 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py + +import os +import ipaddress + +import netifaces as ni +import torch +import mscclpp +import mscclpp.ext +import mscclpp.utils as mscclpp_utils + +# -- Helpers ------------------------------------------------------------------ + + +def _make_tensor(size_bytes: int, dtype: torch.dtype) -> torch.Tensor: + """Allocate a tensor backed by RawGpuBuffer (symmetric memory).""" + # PyTorch's from_dlpack does not support certain float8 DLPack type codes. + # Work around by importing as uint8 and reinterpreting via .view(). 
+ _DLPACK_UNSUPPORTED = (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz) + if dtype in _DLPACK_UNSUPPORTED: + dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(torch.uint8)) + return torch.utils.dlpack.from_dlpack(dlpack).view(dtype) + dlpack = mscclpp.RawGpuBuffer(size_bytes).to_dlpack(data_type=str(dtype)) + return torch.utils.dlpack.from_dlpack(dlpack) + + +def _load_algorithms(scratch: torch.Tensor, rank: int): + return mscclpp.ext.AlgorithmCollectionBuilder().build_default_algorithms( + scratch_buffer=scratch.data_ptr(), + scratch_buffer_size=scratch.nbytes, + rank=rank, + ) + + +def _interfaces_for_ip(ip: str): + target = ipaddress.ip_address(ip) + for iface in ni.interfaces(): + addrs = ni.ifaddresses(iface) + if ni.AF_INET in addrs: + for link in addrs[ni.AF_INET]: + if "addr" in link and ipaddress.ip_address(link["addr"]) == target: + return iface + return None + + +def _to_mscclpp_op(op) -> mscclpp.ReduceOp: + if op == torch.distributed.ReduceOp.SUM: + return mscclpp.ReduceOp.SUM + if op == torch.distributed.ReduceOp.MIN: + return mscclpp.ReduceOp.MIN + raise ValueError(f"unsupported op: {op}") + + +def _round_pow2(size: int) -> int: + """Round up to next power-of-2, clamped to [1024, 256 MB].""" + size = max(size, 1024) + size = min(size, 256 << 20) + return 1 << (size - 1).bit_length() + + +# -- CustomizedComm ----------------------------------------------------------- + + +class CustomizedComm: + """Exposes all_reduce, all_gather, barrier with lazy per-size tuning.""" + + _TUNE_N_WARMUP = 5 + _TUNE_N_GRAPH_LAUNCHES = 10 + _TUNE_N_OPS_PER_GRAPH = 100 + _CANDIDATE_NBLOCKS = [4, 8, 16, 24, 32, 48, 64, 128] + _CANDIDATE_NTHREADS = [512, 768, 1024] + _NBLOCKS_LIMIT = { + "default_allreduce_nvls_packet": 16, + "default_allreduce_packet": 56, + "default_allreduce_allpair_packet": 56, + "default_allreduce_fullmesh": 64, + "default_allgather_fullmesh2": 32, + } + + def __init__(self, comm: mscclpp.CommGroup, symmetric_memory: bool = False): + self.comm = comm + self.rank = comm.my_rank + self.world_size = comm.nranks + self.symmetric_memory = symmetric_memory + self._nvls = mscclpp.is_nvls_supported() + + self._scratch = _make_tensor(1 << 27, torch.float16) + self._barrier_tensor = _make_tensor(4096, torch.float32) + + algos = _load_algorithms(self._scratch, self.rank) + self._algos = {(a.collective, a.name): a for a in algos} + + # {collective: {rounded_size: (algo, nblocks, nthreads)}} + self._tune_cache: dict[str, dict[int, tuple]] = {"allreduce": {}, "allgather": {}} + self._tune_buf = None + self._time_buf = None + + def _algo(self, collective: str, name: str): + return self._algos.get((collective, name)) + + def _default_ar_config(self): + """Fallback allreduce config for barrier / timing sync.""" + pkt = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and pkt: + return (pkt, 0, 0) + return (self._algo("allreduce", "default_allreduce_packet"), 0, 0) + + # -- low-level execute -- + + def _exec_ar(self, tensor, algo, nb, nt, op=mscclpp.ReduceOp.SUM, stream=None, accum_dtype=None, sym=True): + s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream + ret = algo.execute( + comm=self.comm.communicator, + input_buffer=tensor.data_ptr(), + output_buffer=tensor.data_ptr(), + input_size=tensor.nbytes, + output_size=tensor.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(tensor.dtype), + op=op, + stream=s, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=sym, + 
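+            # accum_dtype is assumed to select the accumulation precision for the
+            # reduction (e.g. fp32 accumulation for fp16 data; see ACCUM_DTYPE in
+            # main()); None keeps the algorithm default.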
accum_dtype=accum_dtype, + ) + if ret != 0: + print(f"Rank {self.rank}: {algo.name} failed ({ret})") + return ret + + def _exec_ag(self, inp, out, algo, nb, nt, stream=None, sym=None): + if sym is None: + sym = self.symmetric_memory + s = stream.cuda_stream if stream else torch.cuda.current_stream().cuda_stream + ret = algo.execute( + comm=self.comm.communicator, + input_buffer=inp.data_ptr(), + output_buffer=out.data_ptr(), + input_size=inp.nbytes, + output_size=out.nbytes, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(inp.dtype), + op=mscclpp.ReduceOp.NOP, + stream=s, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=sym, + ) + if ret != 0: + print(f"Rank {self.rank}: AG {algo.name} failed ({ret})") + return ret + + def _barrier_internal(self): + a, nb, nt = self._default_ar_config() + self._exec_ar(self._barrier_tensor, a, nb, nt, sym=True) + + # -- lazy tuning -- + + def _ensure_tune_bufs(self): + if self._tune_buf is None: + self._tune_buf = _make_tensor(1 << 27, torch.float16) + self._tune_buf.normal_() + self._time_buf = _make_tensor(4096, torch.float32) + return self._tune_buf + + def _ar_candidates(self, size: int): + out = [] + if size <= 4 << 20: + a = self._algo("allreduce", "default_allreduce_nvls_packet") + if self._nvls and a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_packet") + if a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_allpair_packet") + if a: + out.append(a) + if size >= 512 << 10: + a = self._algo("allreduce", "default_allreduce_nvls_zero_copy") + if self._nvls and self.symmetric_memory and a: + out.append(a) + a = self._algo("allreduce", "default_allreduce_rsag_zero_copy") + if a: + out.append(a) + if torch.version.hip is not None: + a = self._algo("allreduce", "default_allreduce_fullmesh") + if a: + out.append(a) + return out + + def _ag_candidates(self): + a = self._algo("allgather", "default_allgather_fullmesh2") + return [a] if a else [] + + def _run_tune(self, collective, algo, buf, size, nb, nt): + """Single tune invocation for either collective.""" + if collective == "allreduce": + return algo.execute( + comm=self.comm.communicator, + input_buffer=buf.data_ptr(), + output_buffer=buf.data_ptr(), + input_size=size, + output_size=size, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), + op=mscclpp.ReduceOp.SUM, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=True, + ) + else: + total = size * self.world_size + out_ptr = buf.data_ptr() + return algo.execute( + comm=self.comm.communicator, + input_buffer=out_ptr + self.rank * size, + output_buffer=out_ptr, + input_size=size, + output_size=total, + dtype=mscclpp_utils.torch_dtype_to_mscclpp_dtype(buf.dtype), + op=mscclpp.ReduceOp.NOP, + stream=torch.cuda.current_stream().cuda_stream, + nblocks=nb, + nthreads_per_block=nt, + symmetric_memory=False, + ) + + def _tune_size(self, collective: str, target_size: int): + """Auto-tune one (collective, target_size) pair and cache result.""" + buf = self._ensure_tune_bufs() + cands = self._ar_candidates(target_size) if collective == "allreduce" else self._ag_candidates() + + best_time, best_cfg = float("inf"), None + used = set() + run = lambda a, nb, nt: self._run_tune(collective, a, buf, target_size, nb, nt) + + for algo in cands: + nb_limit = self._NBLOCKS_LIMIT.get(algo.name, 128) + for nb in self._CANDIDATE_NBLOCKS: + if nb > nb_limit: + continue + for nt in self._CANDIDATE_NTHREADS: + # Feasibility — sync result across ranks so all agree + 
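+                    # A config unsupported on any one rank must be skipped on all ranks:
+                    # if ranks disagreed, some would skip a collective that others enter
+                    # and the job would hang, so the return code is allreduced first.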
ret = run(algo, nb, nt) + torch.cuda.synchronize() + self._time_buf[0] = float(ret) + self._exec_ar(self._time_buf[:1], *self._default_ar_config(), sym=True) + if self._time_buf[0].item() != 0: + continue + used.add(algo) + + # Warmup + for _ in range(self._TUNE_N_WARMUP): + run(algo, nb, nt) + + # CUDA-graph timed benchmark + cs = torch.cuda.Stream() + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(self._TUNE_N_OPS_PER_GRAPH): + run(algo, nb, nt) + + start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + start.record(cs) + with torch.cuda.stream(cs): + for _ in range(self._TUNE_N_GRAPH_LAUNCHES): + g.replay() + end.record(cs) + end.synchronize() + elapsed = start.elapsed_time(end) + + # Cross-rank timing sync + self._time_buf.fill_(elapsed) + torch.cuda.current_stream().wait_stream(cs) + self._exec_ar(self._time_buf, *self._default_ar_config(), sym=True) + avg = self._time_buf[self.rank].item() / self.world_size + + if avg < best_time: + best_time, best_cfg = avg, (algo, nb, nt) + + if best_cfg: + self._tune_cache[collective][target_size] = best_cfg + if self.rank == 0: + n = self._TUNE_N_GRAPH_LAUNCHES * self._TUNE_N_OPS_PER_GRAPH + print( + f"[tune] {collective} size={target_size}: {best_cfg[0].name} " + f"nb={best_cfg[1]} nt={best_cfg[2]} time={best_time / n * 1000:.2f}us", + flush=True, + ) + else: + fb = ( + self._default_ar_config() + if collective == "allreduce" + else ((self._ag_candidates()[0], 32, 512) if self._ag_candidates() else None) + ) + self._tune_cache[collective][target_size] = fb + + torch.cuda.synchronize() + self._barrier_internal() + for a in used: + a.reset() + + # -- public API -- + + def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, stream=None, accum_dtype=None): + sz = _round_pow2(tensor.nbytes) + if sz not in self._tune_cache["allreduce"]: + self._tune_size("allreduce", sz) + a, nb, nt = self._tune_cache["allreduce"][sz] + self._exec_ar( + tensor, a, nb, nt, op=_to_mscclpp_op(op), stream=stream, accum_dtype=accum_dtype, sym=self.symmetric_memory + ) + + def all_gather(self, output_tensor, input_tensor, stream=None): + sz = _round_pow2(input_tensor.nbytes) + if sz not in self._tune_cache["allgather"]: + self._tune_size("allgather", sz) + a, nb, nt = self._tune_cache["allgather"][sz] + self._exec_ag(input_tensor, output_tensor, a, nb, nt, stream=stream, sym=self.symmetric_memory) + + def barrier(self): + self._barrier_internal() + + def destroy(self): + self._algos.clear() + self._tune_cache = {"allreduce": {}, "allgather": {}} + self._tune_buf = self._time_buf = self._barrier_tensor = self._scratch = self.comm = None + + +# -- Benchmarks (standalone) -------------------------------------------------- + + +def _bench_sizes(low=5 * 1024, high=80 << 20): + sizes, c = [], low + while c <= high: + sizes.append(c) + c *= 2 + return sizes + + +def benchmark_allreduce( + comm: CustomizedComm, dtype=torch.float16, accum_dtype=None, n_warmup=10, n_graph_launches=10, n_iter=100 +): + sizes = _bench_sizes() + if comm.rank == 0: + print(f"\n{'='*60}\nAllreduce Benchmark\n{'='*60}") + print(f"{'Nelements':<18} {'Size(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") + + cs = torch.cuda.Stream() + buf = _make_tensor(1 << 27, dtype) + buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) + + for size in sizes: + nelems = size // buf.element_size() + t = buf[: size // buf.element_size()] + comm.all_reduce(t, 
accum_dtype=accum_dtype) + torch.cuda.synchronize() + + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(n_iter): + comm.all_reduce(t, accum_dtype=accum_dtype) + with torch.cuda.stream(cs): + for _ in range(n_warmup): + g.replay() + comm.barrier() + cs.synchronize() + + s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + s.record(cs) + with torch.cuda.stream(cs): + for _ in range(n_graph_launches): + g.replay() + e.record(cs) + e.synchronize() + + ms = s.elapsed_time(e) / (n_graph_launches * n_iter) + if comm.rank == 0: + print(f"{nelems:<18} {size:<18} {ms*1000:<18.2f} {size/(ms*1e-3)/1e9:<18.2f}") + + +def benchmark_allgather(comm: CustomizedComm, dtype=torch.float16, n_warmup=10, n_graph_launches=10, n_iter=100): + sizes = _bench_sizes() + if comm.rank == 0: + print(f"\n{'='*60}\nAllgather Benchmark\n{'='*60}") + print(f"{'PerRank(B)':<18} {'Total(B)':<18} {'Time(us)':<18} {'AlgoBW(GB/s)':<18}") + + cs = torch.cuda.Stream() + buf = _make_tensor(1 << 27, dtype) + buf.normal_() if dtype in (torch.float16, torch.float32, torch.bfloat16) else buf.fill_(0) + + for prs in sizes: + total = prs * comm.world_size + if total > buf.nbytes: + break + nt = total // buf.element_size() + npr = prs // buf.element_size() + out = buf[:nt] + inp = out[comm.rank * npr : (comm.rank + 1) * npr] + + comm.all_gather(out, inp) + torch.cuda.synchronize() + + cs.wait_stream(torch.cuda.current_stream()) + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=cs): + for _ in range(n_iter): + comm.all_gather(out, inp) + with torch.cuda.stream(cs): + for _ in range(n_warmup): + g.replay() + comm.barrier() + cs.synchronize() + + s, e = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + s.record(cs) + with torch.cuda.stream(cs): + for _ in range(n_graph_launches): + g.replay() + e.record(cs) + e.synchronize() + + ms = s.elapsed_time(e) / (n_graph_launches * n_iter) + if comm.rank == 0: + print(f"{prs:<18} {total:<18} {ms*1000:<18.2f} {total/(ms*1e-3)/1e9:<18.2f}") + + +# -- Bootstrap & main --------------------------------------------------------- + + +def init_dist() -> mscclpp.CommGroup: + addr = os.environ.get("MSCCLPP_MASTER_ADDR") + if addr: + rank, world = int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"]) + port = os.environ["MSCCLPP_MASTER_PORT"] + iface = _interfaces_for_ip(addr) + if not iface: + raise ValueError(f"No interface for {addr}") + return mscclpp.CommGroup(interfaceIpPortTrio=f"{iface}:{addr}:{port}", rank=rank, size=world) + import torch.distributed as dist + + dist.init_process_group(backend="gloo") + return mscclpp.CommGroup(torch_group=dist.group.WORLD) + + +def main(): + local = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(local) + + dtype_str = os.environ.get("DTYPE", "float16") + dtype = getattr(torch, dtype_str, torch.float16) + accum_map = {"float32": mscclpp.DataType.float32, "float16": mscclpp.DataType.float16} + accum_str = os.environ.get("ACCUM_DTYPE") + accum_dtype = accum_map.get(accum_str) if accum_str else None + + comm_group = init_dist() + cc = CustomizedComm(comm_group) + + print(f"rank {local} starting benchmarks with dtype={dtype} accum_dtype={accum_dtype}...") + benchmark_allreduce(cc, dtype=dtype, accum_dtype=accum_dtype) + cc.barrier() + torch.cuda.synchronize() + + benchmark_allgather(cc, dtype=dtype) + cc.barrier() + torch.cuda.synchronize() + + cc.destroy() + print(f"rank {local} completed successfully.") + + +if 
__name__ == "__main__": + main() diff --git a/examples/torch-integration/dsl_with_nccl_api.py b/examples/torch-integration/dsl_with_nccl_api.py index 975d3749..5a4dd1c4 100644 --- a/examples/torch-integration/dsl_with_nccl_api.py +++ b/examples/torch-integration/dsl_with_nccl_api.py @@ -1,19 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# LD_PRELOAD=/build/lib/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py +# LD_PRELOAD=/build/lib/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py import os from typing import Any, Dict import torch, torch.distributed as dist -import mscclpp +import mscclpp.ext from mscclpp.language.collectives import AllReduce from mscclpp.language.channel import SwitchChannel, MemoryChannel, BufferType, SyncType from mscclpp.language.program import CollectiveProgram from mscclpp.language.rank import Rank +from mscclpp.language.utils import AlgoSpec -def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram: +def allreduce_nvls(spec: AlgoSpec) -> CollectiveProgram: gpu_size = spec.world_size with CollectiveProgram.from_spec(spec) as program: # Creating Channels @@ -63,8 +64,8 @@ def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram: return program -def setup_plan(algo_collection_builder: mscclpp.AlgorithmCollectionBuilder, rank: int, world_size: int): - spec = mscclpp.AlgoSpec( +def setup_plan(algo_collection_builder: mscclpp.ext.AlgorithmCollectionBuilder, rank: int, world_size: int): + spec = AlgoSpec( name="allreduce_nvls", collective=AllReduce(8, 1, True), nranks_per_node=8, @@ -94,10 +95,10 @@ def init_dist(): rank = int(os.environ["RANK"]) world = int(os.environ["WORLD_SIZE"]) local = int(os.environ["LOCAL_RANK"]) - algorithm_collection_builder = mscclpp.AlgorithmCollectionBuilder() + algorithm_collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() setup_plan(algorithm_collection_builder, rank, world) algorithm_collection_builder.set_algorithm_selector(selector) - dist.init_process_group(backend="nccl", device_id=local) + dist.init_process_group(backend="nccl", device_id=torch.device("cuda", local)) return rank, world, local diff --git a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu index 0e2ab5ad..f3c69b72 100644 --- a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu +++ b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu @@ -9,7 +9,7 @@ #include template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -88,7 +88,7 @@ int main() { mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1); mscclpp::BaseMemoryChannel memChan0(sema0); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle(); - void *devHandle0; + void* devHandle0; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice)); @@ -98,14 +98,14 @@ int main() { mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0); mscclpp::BaseMemoryChannel memChan1(sema1); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle(); - void *devHandle1; + void* devHandle1; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice)); log("GPU 0: Launching gpuKernel0 ..."); MSCCLPP_CUDATHROW(cudaSetDevice(0)); - gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle0), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle0), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); log("GPU 1: Launching gpuKernel1 ..."); @@ -115,7 +115,7 @@ MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle1), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle1), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu index 05eb1b25..0526407e 100644 --- a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu +++ b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu @@ -14,7 +14,7 @@ #define PORT_NUMBER "50505" template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -115,14 +115,14 @@ void worker(int gpuId) { mscclpp::BaseMemoryChannel memChan(sema); auto memChanHandle = memChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); log("GPU ", gpuId, ": Launching a GPU kernel ..."); if (gpuId == 0) { - gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); } else { @@ -130,7 +130,7 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle *>(devHandle), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast<mscclpp::BaseMemoryChannelDeviceHandle*>(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index e9007612..a1be59f2 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template <typename... Args> -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ...
<< args); ss << std::endl; @@ -47,7 +47,7 @@ int wait_process(int pid) { __device__ mscclpp::DeviceSyncer devSyncer; -__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si } } -__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x); } -__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank, +__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank, uint32_t flag) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { @@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int gpuId) { +void worker(int myRank, int gpuId, const std::string& ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); - const int myRank = gpuId; const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; const int iter = 1000; @@ -105,11 +104,11 @@ void worker(int gpuId) { const size_t bufferBytes = 256 * 1024 * 1024; const size_t pktBufferBytes = 256 * 1024 * 1024; - log("GPU ", gpuId, ": Preparing for tests ..."); + log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ..."); // Build a connection and a semaphore auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks); - bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER); + bootstrap->initialize(ipPort); mscclpp::Communicator comm(bootstrap); auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get(); auto sema = comm.buildSemaphore(conn, remoteRank).get(); @@ -133,8 +132,8 @@ void worker(int gpuId) { auto memChanHandle = memChan.deviceHandle(); auto memPktChanHandle = memPktChan.deviceHandle(); - void *devHandle; - void *devPktHandle; + void* devHandle; + void* devPktHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); @@ -146,23 +145,23 @@ std::function<void(size_t)> kernels[3]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle), - copyBytes, myRank); + bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes, + myRank); }; kernels[1] = [&](size_t copyBytes) { - bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devHandle), - copyBytes, myRank); + bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devHandle), copyBytes, + myRank); }; kernels[2] = [&](size_t copyBytes) { static uint32_t flag = 1; - bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle *>(devPktHandle), + bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast<mscclpp::MemoryChannelDeviceHandle*>(devPktHandle), copyBytes, myRank, flag++); }; cudaEvent_t start, end; - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); } @@ -189,13 +188,13 @@ MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(start, stream)); } MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(end, stream)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); float elapsedTime; @@ -204,8 +203,8 @@ MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end)); elapsedTimePerIter = elapsedTime / iter; gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f; - log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ", - gbps, " GB/s"); + log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, + " ms/iter, BW ", gbps, " GB/s"); } MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec)); @@ -216,23 +215,47 @@ bootstrap->barrier(); } -int main() { - int pid0 = spawn_process([]() { worker(0); }); - int pid1 = spawn_process([]() { worker(1); }); - if (pid0 < 0 || pid1 < 0) { - log("Failed to spawn processes."); +int main(int argc, char** argv) { + if (argc == 1) { + int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); + int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); +
diff --git a/examples/tutorials/04-port-channel/bidir_port_channel.cu b/examples/tutorials/04-port-channel/bidir_port_channel.cu
index 46064581..9e6d61dd 100644
--- a/examples/tutorials/04-port-channel/bidir_port_channel.cu
+++ b/examples/tutorials/04-port-channel/bidir_port_channel.cu
@@ -16,7 +16,7 @@ #define PORT_NUMBER "50505"
 template <typename... Args>
-void log(Args &&...args) {
+void log(Args&&... args) {
   std::stringstream ss;
   (ss << ... << args);
   ss << std::endl;
   std::cout << ss.str();
 }
@@ -45,7 +45,7 @@ int wait_process(int pid) {
   return -1;
 }

-__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) {
+__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) {
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid == 0) {
     devHandle->signal();
@@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size
   }
 }

-void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) {
+void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) {
   MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
   const int myRank = rank;
   const int remoteRank = myRank == 0 ? 1 : 0;
@@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t

   auto portChanHandle = portChan.deviceHandle();

-  void *devHandle;
+  void* devHandle;
   MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle)));
   MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice));
@@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t

   std::function<void(size_t)> kernels[1];
   kernels[0] = [&](size_t copyBytes) {
-    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle *>(devHandle), copyBytes,
+    bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast<mscclpp::PortChannelDeviceHandle*>(devHandle), copyBytes,
                                         myRank);
   };
@@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t
   bootstrap->barrier();
 }

-mscclpp::Transport parseTransport(const std::string &transportStr) {
+mscclpp::Transport parseTransport(const std::string& transportStr) {
   if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc;
   if (transportStr == "IB0") return mscclpp::Transport::IB0;
   if (transportStr == "IB1") return mscclpp::Transport::IB1;
@@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) {
   throw std::runtime_error("Unknown transport: " + transportStr);
 }

-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   if (argc == 1) {
     int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
     int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); });
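The port-channel hunks above only touch formatting, so most of the kernel body is elided by the diff; for orientation, a hedged sketch of the single-thread pattern bidirPutKernel follows. It assumes the device-side header mscclpp/port_channel_device.hpp and that the handle also exposes put and flush as in the full tutorial source; the offsets are illustrative:

#include <mscclpp/port_channel_device.hpp>

#include <cstddef>

// One thread drives the whole exchange: announce readiness, wait for the
// peer's signal, then ask the host-side proxy to write copyBytes and flush.
__global__ void putOnceKernel(mscclpp::PortChannelDeviceHandle* chan, size_t copyBytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    chan->signal();              // tell the remote rank our buffer is ready
    chan->wait();                // wait until the remote rank says the same
    chan->put(0, 0, copyBytes);  // dstOffset, srcOffset, bytes
    chan->flush();               // block until the proxy has issued the write
  }
}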
diff --git a/examples/tutorials/05-switch-channel/Makefile b/examples/tutorials/05-switch-channel/Makefile
new file mode 100644
index 00000000..1a211f64
--- /dev/null
+++ b/examples/tutorials/05-switch-channel/Makefile
@@ -0,0 +1,15 @@
+CUDA_HOME ?= /usr/local/cuda
+
+COMPILER := $(CUDA_HOME)/bin/nvcc
+ARCH_FLAG := -arch=native
+
+TARGET = bidir_switch_channel
+SRC = bidir_switch_channel.cu
+
+all: $(TARGET)
+
+$(TARGET): $(SRC)
+	$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp
+
+clean:
+	rm -f $(TARGET)
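Before the new source file below, a quick worked check of the index partitioning its kernelSwitchReduce uses (plain host C++; all names are local to this sketch): with two ranks, rank 0 covers [0, n/2) and rank 1 covers [n/2, n), so every element is reduced and broadcast exactly once across the pair.

#include <cstdio>

int main() {
  const int numElements = 8;
  for (int rank = 0; rank < 2; ++rank) {
    int lo = rank * (numElements / 2);        // rank 0 -> 0, rank 1 -> 4
    int hi = (rank + 1) * (numElements / 2);  // rank 0 -> 4, rank 1 -> 8
    std::printf("rank %d handles [%d, %d)\n", rank, lo, hi);
  }
  return 0;
}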
diff --git a/examples/tutorials/05-switch-channel/bidir_switch_channel.cu b/examples/tutorials/05-switch-channel/bidir_switch_channel.cu
new file mode 100644
index 00000000..658e6f05
--- /dev/null
+++ b/examples/tutorials/05-switch-channel/bidir_switch_channel.cu
@@ -0,0 +1,177 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PORT_NUMBER "50505"
+
+template <typename... Args>
+void log(Args &&...args) {
+  std::stringstream ss;
+  (ss << ... << args);
+  ss << std::endl;
+  std::cout << ss.str();
+}
+
+int spawn_process(std::function<void()> func) {
+  pid_t pid = fork();
+  if (pid < 0) return -1;
+  if (pid == 0) {
+    // Child process
+    func();
+    exit(0);
+  }
+  return pid;
+}
+
+int wait_process(int pid) {
+  int status;
+  if (waitpid(pid, &status, 0) < 0) {
+    return -1;
+  }
+  if (WIFEXITED(status)) {
+    return WEXITSTATUS(status);
+  }
+  return -1;
+}
+
+__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
+
+__device__ mscclpp::DeviceSyncer devSyncer;
+
+__global__ void kernelSwitchReduce(int rank, int numElements) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+
+  // rank 0 performs on first half of data and rank 1 on second half
+  int min = rank * (numElements / 2);
+  int max = (rank + 1) * (numElements / 2);
+
+  for (int i = tid + min; i < max; i += stride) {
+    auto val = gConstSwitchChan.reduce(i);
+    gConstSwitchChan.broadcast(i, val);
+  }
+}
+
+void worker(int myRank, int gpuId, const std::string &ipPort) {
+  MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
+  const int nRanks = 2;
+  const int iter = 1000;
+  const size_t bufferBytes = 128 * 1024 * 1024;
+
+  log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
+
+  // Build a connection and a semaphore
+  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
+  bootstrap->initialize(ipPort);
+  std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
+
+  std::vector<int> ranks;
+  ranks.reserve(nRanks);
+  for (int i = 0; i < nRanks; i++) ranks.push_back(i);
+
+  auto buffer = mscclpp::GpuBuffer(bufferBytes);
+
+  auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
+
+  auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
+
+  auto deviceHandle = switchChannel.deviceHandle();
+
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+  // Call the kernel in a loop for perf evaluation
+
+  for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
+    cudaEvent_t start, end;
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventCreate(&start));
+      MSCCLPP_CUDATHROW(cudaEventCreate(&end));
+    }
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    bootstrap->barrier();
+
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
+    }
+
+    for (int i = 0; i < iter; ++i) {
+      kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
+    }
+
+    MSCCLPP_CUDATHROW(cudaGetLastError());
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+
+    comm->bootstrap()->barrier();
+
+    if (myRank == 0) {
+      MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
+      MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
+      float elapsedTime;
+      float elapsedTimePerIter;
+      float gbps;
+      MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
+      elapsedTimePerIter = elapsedTime / iter;
+      float dataSize = numElements * 4;
+      gbps = dataSize / elapsedTimePerIter * 1e-6f;
+      log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
+          gbps, " GB/s");
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  if (argc == 1) {
+    int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
+    int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
+    if (pid0 < 0 || pid1 < 0) {
+      log("Failed to spawn processes.");
+      return -1;
+    }
+    int status0 = wait_process(pid0);
+    int status1 = wait_process(pid1);
+    if (status0 < 0 || status1 < 0) {
+      log("Failed to wait for processes.");
+      return -1;
+    }
+    if (status0 != 0 || status1 != 0) {
+      log("One of the processes failed.");
+      return -1;
+    }
+    log("Succeed!");
+    return 0;
+  } else if (argc == 4) {
+    std::string ipPort = argv[1];
+    int rank, gpuId;
+    try {
+      rank = std::stoi(argv[2]);
+      gpuId = std::stoi(argv[3]);
+    } catch (const std::exception &) {
+      log("Error: rank and gpu_id must be valid integers.");
+      return -1;
+    }
+    if (rank < 0 || rank > 1 || gpuId < 0) {
+      log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
+      return -1;
+    }
+    worker(rank, gpuId, ipPort);
+    log("Rank ", rank, ": Succeed!");
+    return 0;
+  } else {
+    std::cerr << "Usage:\n"
+              << "  " << argv[0] << "                            Run in intra-node mode\n"
+              << "  " << argv[0] << " <ip_port> <rank> <gpu_id>  Run in inter-node mode\n";
+    return -1;
+  }
+}
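The perf loop above converts the cudaEventElapsedTime result (milliseconds for iter launches) into GB/s as bytes / msPerIter * 1e-6: bytes per millisecond times 1e3 gives bytes per second, and dividing by 1e9 yields GB/s. A self-contained sketch of the same timing pattern, with a plain CUDA-runtime error check standing in for MSCCLPP_CUDATHROW and a stand-in kernel:

#include <cuda_runtime.h>

#include <cstdio>

__global__ void emptyKernel() {}

// Returns milliseconds per launch over `iter` launches, or a negative value
// on error; bandwidth then follows as bytes / result * 1e-6 (GB/s).
float msPerIter(int iter) {
  cudaEvent_t start, end;
  if (cudaEventCreate(&start) != cudaSuccess) return -1.f;
  if (cudaEventCreate(&end) != cudaSuccess) return -1.f;
  cudaEventRecord(start, 0);
  for (int i = 0; i < iter; ++i) emptyKernel<<<1, 1>>>();
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);  // wait until the last launch has finished
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);
  return ms / iter;
}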
wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 4) { + std::string ipPort = argv[1]; + int rank, gpuId; + try { + rank = std::stoi(argv[2]); + gpuId = std::stoi(argv[3]); + } catch (const std::exception &) { + log("Error: rank and gpu_id must be valid integers."); + return -1; + } + if (rank < 0 || rank > 2 || gpuId < 0) { + log("Error: rank must be between 0 and 1 and gpu_id must be non-negative."); + return -1; + } + worker(rank, gpuId, ipPort); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage:\n" + << " " << argv[0] << " Run in intra-node mode\n" + << " " << argv[0] << " Run in inter-node mode\n"; + return -1; + } +} diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 7acdb8b8..531cb857 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -84,6 +84,11 @@ class Algorithm { /// @return The Constraint struct specifying worldSize and nRanksPerNode requirements. virtual Constraint constraint() const = 0; + /// Set the valid message size range for this algorithm. + /// @param minMessageSize Minimum supported message size in bytes. + /// @param maxMessageSize Maximum supported message size in bytes. + virtual void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) = 0; + /// Execute the algorithm. /// @param comm The communicator to use. /// @param input Pointer to the input buffer. @@ -96,12 +101,16 @@ class Algorithm { /// @param executor The executor for DSL algorithms (may be nullptr for native). /// @param nBlocks Number of CUDA blocks (0 for auto-selection). /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). + /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. + /// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype. /// @return The result of the operation. virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) = 0; + bool symmetricMemory = false, + const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) = 0; /// Reset the algorithm state, clearing any cached contexts. virtual void reset() = 0; @@ -179,10 +188,11 @@ class NativeAlgorithm : public Algorithm { /// @param nBlocks Number of CUDA blocks. /// @param nThreadsPerBlock Number of threads per block. /// @param extras Additional algorithm-specific parameters. + /// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel). /// @return The result of the operation. using KernelFunc = std::function, const void*, void*, size_t, size_t, DataType, ReduceOp, - cudaStream_t, int, int, const std::unordered_map&)>; + cudaStream_t, int, int, const std::unordered_map&, DataType)>; /// Function type for creating algorithm contexts. /// @param comm The communicator. @@ -201,9 +211,10 @@ class NativeAlgorithm : public Algorithm { /// @param inputSize Size of the input buffer. /// @param outputSize Size of the output buffer. /// @param dtype Data type of the elements. + /// @param symmetricMemory Whether symmetric memory is enabled. 
/// @return A key uniquely identifying this buffer configuration. using ContextKeyGenFunc = std::function; + size_t outputSize, DataType dtype, bool symmetricMemory)>; /// Construct a NativeAlgorithm. /// @param name Human-readable name of the algorithm. @@ -225,10 +236,12 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; + void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override; const std::unordered_map& tags() const override; const CollectiveBufferMode& bufferMode() const override; AlgorithmType type() const override { return AlgorithmType::Native; } @@ -269,12 +282,14 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; + void setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) override; const std::unordered_map& tags() const override; const CollectiveBufferMode& bufferMode() const override; CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; void reset() override; @@ -299,6 +314,7 @@ struct CollectiveRequest { const void* inputBuffer; void* outputBuffer; size_t messageSize; + cudaStream_t stream; const std::string& collective; const DataType dtype; const std::unordered_map>& hints; @@ -358,6 +374,10 @@ class AlgorithmCollection { AlgoSelectFunc fallbackAlgoSelector_ = nullptr; }; +/// Get a default GPU flag buffer (allocated once and reused). +/// @return A pair of (shared_ptr to the flag buffer, size in bytes). 
+std::pair, size_t> getFlagBuffer(); + } // namespace mscclpp #endif // MSCCLPP_ALGORITHM_HPP_ \ No newline at end of file diff --git a/include/mscclpp/assert_device.hpp b/include/mscclpp/assert_device.hpp index bf982ba6..1b9cb611 100644 --- a/include/mscclpp/assert_device.hpp +++ b/include/mscclpp/assert_device.hpp @@ -19,11 +19,11 @@ #else // defined(DEBUG_BUILD) #if defined(MSCCLPP_DEVICE_HIP) -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function); +extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function); #else // !defined(MSCCLPP_DEVICE_HIP) -extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function) __THROW; +extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function) __THROW; #endif // !defined(MSCCLPP_DEVICE_HIP) /// Assert a condition on the device and print a message if the condition is false. diff --git a/include/mscclpp/atomic_device.hpp b/include/mscclpp/atomic_device.hpp index 74f6122f..d00bb50c 100644 --- a/include/mscclpp/atomic_device.hpp +++ b/include/mscclpp/atomic_device.hpp @@ -38,7 +38,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_o return cuda::atomic_ref{*ptr}.fetch_add(val, memoryOrder); } -#elif defined(MSCCLPP_DEVICE_HIP) +#else // !defined(MSCCLPP_DEVICE_CUDA) constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED; constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE; @@ -46,7 +46,6 @@ constexpr auto memoryOrderRelease = __ATOMIC_RELEASE; constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL; constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST; -// HIP does not have thread scope enums like CUDA constexpr auto scopeSystem = 0; constexpr auto scopeDevice = 0; @@ -65,7 +64,7 @@ MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrde return __atomic_fetch_add(ptr, val, memoryOrder); } -#endif // defined(MSCCLPP_DEVICE_HIP) +#endif // !defined(MSCCLPP_DEVICE_CUDA) } // namespace mscclpp diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 38b05ccf..ca2fc34f 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -381,11 +381,19 @@ struct EndpointConfig { /// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other /// transports. struct Ib { + /// IB mode for signaling, used to select between different implementations. + enum class Mode { + Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset). + Host, // Use the host stack with RDMA atomics. + HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics). + }; + static constexpr int DefaultPort = -1; - static constexpr int DefaultGidIndex = 0; + static constexpr int DefaultGidIndex = -1; static constexpr int DefaultMaxCqSize = 1024; static constexpr int DefaultMaxCqPollNum = 1; static constexpr int DefaultMaxSendWr = 8192; + static constexpr int DefaultMaxRecvWr = 16; static constexpr int DefaultMaxWrPerSend = 64; /// Device index. Currently ignored; use transport type (IB0-IB7) to select device. @@ -394,32 +402,41 @@ struct EndpointConfig { int port; /// GID index. int gidIndex; - /// Maximum size of the completion queue. + /// Maximum size of the send completion queue. 
int maxCqSize; - /// Maximum number of completion queue polls per operation. + /// Maximum number of send completion queue polls per operation. int maxCqPollNum; /// Maximum number of outstanding send work requests. int maxSendWr; + /// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate). + int maxRecvWr; /// Maximum number of work requests per send operation. int maxWrPerSend; + /// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable. + Mode mode; /// Constructor. /// @param deviceIndex Device index. /// @param port Port number. - /// @param gidIndex GID index. - /// @param maxCqSize Maximum completion queue size. - /// @param maxCqPollNum Maximum completion queue poll count. + /// @param gidIndex GID index. If -1 (default), uses `MSCCLPP_IB_GID_INDEX` env variable. + /// @param maxCqSize Maximum send completion queue size. + /// @param maxCqPollNum Maximum send completion queue poll count. /// @param maxSendWr Maximum outstanding send work requests. + /// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode). /// @param maxWrPerSend Maximum work requests per send operation. + /// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable). Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize, - int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend) + int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr, + int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default) : deviceIndex(deviceIndex), port(port), gidIndex(gidIndex), maxCqSize(maxCqSize), maxCqPollNum(maxCqPollNum), maxSendWr(maxSendWr), - maxWrPerSend(maxWrPerSend) {} + maxRecvWr(maxRecvWr), + maxWrPerSend(maxWrPerSend), + mode(mode) {} }; /// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet). @@ -658,6 +675,7 @@ class Connection { friend class SemaphoreStub; friend class Semaphore; friend class ProxyService; + friend class BaseConnection; }; /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 5972234b..a6dd306b 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -54,6 +54,12 @@ class Env { /// default libibverbs library found in the system. const std::string ibvSo; + /// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel. + /// Allowed values: + /// - "host": use the host stack with RDMA atomics (default). + /// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics). + const std::string ibvMode; + /// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname. /// This is used to determine whether the host is the same across different processes. const std::string hostid; @@ -70,9 +76,9 @@ class Env { /// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this. const std::string commId; - /// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to - /// use execution plans for the NCCL API. Unset by default. - const std::string executionPlanDir; + /// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files. 
+ /// If unset, it defaults to `~/.cache/mscclpp`. + const std::string cacheDir; /// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be /// enabled and will dump traces to this directory. Unset by default. @@ -92,17 +98,27 @@ class Env { /// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`. const std::string forceNcclFallbackOperation; - /// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs. - /// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with - /// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling - /// the channel cache won't affect the performance, but otherwise it may lead to performance degradation. + /// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory + /// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. If set to false, the + /// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections + /// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs + /// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator. /// Default is false. - const bool disableChannelCache; + const bool ncclSymmetricMemory; /// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++. /// Default is false. const bool forceDisableNvls; + /// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++. + /// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded. + /// Default is false. + const bool forceDisableGdr; + + /// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport. + /// Default is 0. Used when `EndpointConfig::Ib::gidIndex` is -1 (unspecified). + const int ibGidIndex; + private: Env(); diff --git a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp index 201d7440..394e8014 100644 --- a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp +++ b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp @@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder { /// @return The built AlgorithmCollection containing all registered algorithms. 
AlgorithmCollection build(); - AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank); + AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank); private: AlgorithmCollectionBuilder() = default; @@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder { AlgoSelectFunc algoSelector_ = nullptr; AlgoSelectFunc fallbackAlgoSelector_ = nullptr; - AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize); + AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, + uintptr_t flagBuffer, size_t flagBufferSize); AlgorithmCollection buildDefaultDslAlgorithms(int rank); static std::shared_ptr gAlgorithmCollectionBuilder_; diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 6a0929aa..b8d096e2 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; using cudaStreamCaptureMode = hipStreamCaptureMode; +using cudaStreamCaptureStatus = hipStreamCaptureStatus; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; @@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal; constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed; +constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone; +constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive; +constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated; constexpr auto cudaHostAllocMapped = hipHostMallocMapped; constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined; constexpr auto cudaMemcpyDefault = hipMemcpyDefault; @@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) #define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__) #define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__) +#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__) #define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__) #define cudaGraphInstantiate(...) hipGraphInstantiate(__VA_ARGS__) #define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__) diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 99b95d9a..41bd5928 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -16,20 +16,27 @@ using __bfloat16 = __hip_bfloat16; using __bfloat162 = __hip_bfloat162; #define __CUDA_BF16_TYPES_EXIST__ -// AMD FP8 support - hip_fp8.h provides __hip_fp8_e4m3_fnuz and __hip_fp8_e5m2_fnuz -// Only available on gfx942 and newer architectures (ROCm 6.0+) +// AMD FP8 support - Use fnuz types for HIP 6.0 or when HIP_FP8_TYPE_FNUZ is enabled and HIP_FP8_TYPE_OCP is not +// enabled. Otherwise, use the standard FP8 types. 
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) #include // Create aliases matching CUDA naming convention for cross-platform compatibility +#if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP) using __fp8_e4m3 = __hip_fp8_e4m3_fnuz; using __fp8_e5m2 = __hip_fp8_e5m2_fnuz; - -// HIP FP8 vector types use storage types (from hip/amd_detail/amd_hip_fp8.h): -using __fp8x2_e4m3 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x2_e5m2 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x4_e4m3 = __hip_fp8x4_storage_t; // uint32_t -using __fp8x4_e5m2 = __hip_fp8x4_storage_t; // uint32_t +using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz; +#else +using __fp8_e4m3 = __hip_fp8_e4m3; +using __fp8_e5m2 = __hip_fp8_e5m2; +using __fp8x2_e4m3 = __hip_fp8x2_e4m3; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2; +#endif #define __FP8_TYPES_EXIST__ #endif // HIP_VERSION_MAJOR >= 6 @@ -57,24 +64,156 @@ using __bfloat162 = __nv_bfloat162; #endif +/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15. +/// Format (MSB first): [sign:1][exponent:4][mantissa:3] +/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention). +/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6. +struct alignas(1) __fp8_e4m3b15 { + uint8_t __x; + + __fp8_e4m3b15() = default; + + /// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {} + + /// Construct from float32 (explicit to avoid ambiguous conversion chains). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {} + + /// Convert to float32. + MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); } + + /// Construct from a raw bit pattern without conversion. + static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) { + __fp8_e4m3b15 r; + r.__x = bits; + return r; + } + + private: + /// Decode fp8_e4m3b15 bits → float32. + /// + /// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler. + /// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7. + /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8, + /// then convert fp16 → float32. + static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) { + // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN. + uint32_t exp = (bits >> 3) & 0xFu; + if (bits == 0x80 || exp == 15) { + union { + uint32_t u; + float f; + } nan_val = {0x7FC00000u}; + return nan_val.f; + } + if (bits == 0) return 0.0f; + + // Triton-style bit manipulation: fp8 → fp16 → fp32. + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // + // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1 + // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15. 
+ // Refer: + // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34 + uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16 + uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position + uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign) + uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 + + // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0 + // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents + // the subnormal fp16 value since both share bias=15. + + // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP). + union { + uint16_t u; + __half h; + } cvt = {fp16_bits}; + return __half2float(cvt.h); + } + + /// Encode float32 → fp8_e4m3b15 bits. + /// + /// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8. + /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15), + /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1. + static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) { + union { + float f; + uint32_t u; + } in = {val}; + + // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz). + if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u; + + // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP). + __half h_val = __float2half_rn(val); + union { + __half h; + uint16_t u; + } cvt = {h_val}; + uint16_t fp16_bits = cvt.u; + + // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80. + uint16_t abs_fp16 = fp16_bits & 0x7FFFu; + if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u; + + // Reconstruct with sign. + uint16_t sign16 = fp16_bits & 0x8000u; + + // Triton-style: fp16 → fp8. + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // + // mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080) + // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias. + // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0 + // Finally: prmt for byte extraction. + // + // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte. + uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u); + // The upper byte now contains [E:4][M:3][round_bit]. + // Combine with sign and extract. + uint16_t with_sign = sign16 | adjusted; + uint8_t result = (uint8_t)(with_sign >> 8); + + // Zero → 0x00 (ensure positive zero, not negative zero which is NaN). + if ((result & 0x7Fu) == 0) result = 0x00u; + + return result; + } +}; + +/// Packed 2x fp8_e4m3b15 storage. +struct alignas(2) __fp8x2_e4m3b15 { + uint16_t __x; +}; + +/// Packed 4x fp8_e4m3b15 storage. +struct alignas(4) __fp8x4_e4m3b15 { + uint32_t __x; +}; + namespace mscclpp { /// Data types supported by mscclpp operations. enum class DataType { - INT32, // 32-bit signed integer. - UINT32, // 32-bit unsigned integer. - FLOAT16, // IEEE 754 half precision. - FLOAT32, // IEEE 754 single precision. - BFLOAT16, // bfloat16 precision. - FP8_E4M3, // FP8 with E4M3 layout. - FP8_E5M2, // FP8 with E5M2 layout. + INT32, // 32-bit signed integer. + UINT32, // 32-bit unsigned integer. + FLOAT16, // IEEE 754 half precision. + FLOAT32, // IEEE 754 single precision. + BFLOAT16, // bfloat16 precision. + FLOAT8_E4M3, // float8 with E4M3 layout. + FLOAT8_E5M2, // float8 with E5M2 layout. + UINT8, // 8-bit unsigned integer. 
+ FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). + AUTO = 255, // Sentinel: resolve to the input dtype at runtime. }; /// Word array. -template +template = 4 && Bytes % 4 == 0)> struct alignas(Bytes) Words { - static_assert(Bytes > 0, "Bytes must be greater than 0"); - static_assert(Bytes % 4 == 0, "Bytes must be multiple of 4"); uint32_t w[Bytes / 4]; MSCCLPP_HOST_DEVICE_INLINE Words() {} @@ -84,18 +223,34 @@ struct alignas(Bytes) Words { MSCCLPP_HOST_DEVICE_INLINE const uint32_t& operator[](int i) const { return w[i]; } }; -/// Vector type. -template -union alignas(sizeof(T) * N) VectorType { +template +struct alignas(Bytes) Words {}; + +/// Vector type implementation (internal). +template +union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); + static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size"); T data[N]; Words words; + StorageT storage; using ElementType = T; constexpr static int Size = N; - MSCCLPP_HOST_DEVICE_INLINE VectorType() {} + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl() {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const StorageT& value) : storage(value) {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const VectorTypeImpl& other) { storage = other.storage; } + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl& operator=(const VectorTypeImpl& other) { + storage = other.storage; + return *this; + } + + MSCCLPP_HOST_DEVICE_INLINE operator StorageT() const { return storage; } MSCCLPP_HOST_DEVICE_INLINE operator T*() { return data; } @@ -106,38 +261,1109 @@ union alignas(sizeof(T) * N) VectorType { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -using i32x1 = VectorType; -using u32x1 = VectorType; -using f64x1 = VectorType; -using f32x1 = VectorType; +// Helper template to get the appropriate vector type for a given element type and count. 
+template +struct VectorTypeHelper { + static constexpr int Bytes = N * sizeof(T); + using type = VectorTypeImpl< + T, N, + std::conditional_t>>>>; +}; -using i32x2 = VectorType; -using u32x2 = VectorType; -using f32x2 = VectorType; -using f16x2 = VectorType<__half, 2>; -using bf16x2 = VectorType<__bfloat16, 2>; +/// Vector type - clean user interface (automatically selects appropriate storage type) +template +using VectorType = typename VectorTypeHelper::type; -using i32x4 = VectorType; -using u32x4 = VectorType; -using f32x4 = VectorType; -using f16x4 = VectorType<__half, 4>; -using bf16x4 = VectorType<__bfloat16, 4>; +// Macro to define specialization AND alias in one go +#define DEFINE_VEC(Alias, T, N, Storage) \ + template <> \ + struct VectorTypeHelper { \ + using type = VectorTypeImpl; \ + }; \ + using Alias = VectorType -using f16x8 = VectorType<__half, 8>; -using bf16x8 = VectorType<__bfloat16, 8>; +DEFINE_VEC(i32x1, int32_t, 1, int32_t); +DEFINE_VEC(u32x1, uint32_t, 1, uint32_t); +DEFINE_VEC(f32x1, float, 1, float); +DEFINE_VEC(f64x1, double, 1, double); + +DEFINE_VEC(i32x2, int32_t, 2, int2); +DEFINE_VEC(u32x2, uint32_t, 2, uint2); +DEFINE_VEC(u8x2, uint8_t, 2, uint16_t); +DEFINE_VEC(f32x2, float, 2, float2); +DEFINE_VEC(f16x2, __half, 2, __half2); +DEFINE_VEC(bf16x2, __bfloat16, 2, __bfloat162); + +DEFINE_VEC(i32x4, int32_t, 4, int4); +DEFINE_VEC(u32x4, uint32_t, 4, uint4); +DEFINE_VEC(u8x4, uint8_t, 4, uint32_t); +DEFINE_VEC(f32x4, float, 4, float4); +DEFINE_VEC(f16x4, __half, 4, uint2); +DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); + +DEFINE_VEC(f16x8, __half, 8, uint4); +DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); + +// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists. +using f32x8 = VectorType; +using f32x16 = VectorType; +using f16x16 = VectorType<__half, 16>; #if defined(__FP8_TYPES_EXIST__) -// FP8 vector types -using fp8_e4m3x2 = VectorType<__fp8_e4m3, 2>; -using fp8_e4m3x4 = VectorType<__fp8_e4m3, 4>; -using fp8_e4m3x8 = VectorType<__fp8_e4m3, 8>; -using fp8_e4m3x16 = VectorType<__fp8_e4m3, 16>; -using fp8_e5m2x2 = VectorType<__fp8_e5m2, 2>; -using fp8_e5m2x4 = VectorType<__fp8_e5m2, 4>; -using fp8_e5m2x8 = VectorType<__fp8_e5m2, 8>; -using fp8_e5m2x16 = VectorType<__fp8_e5m2, 16>; +DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); +DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); +DEFINE_VEC(f8_e4m3x8, __fp8_e4m3, 8, uint2); +DEFINE_VEC(f8_e4m3x16, __fp8_e4m3, 16, uint4); + +DEFINE_VEC(f8_e5m2x2, __fp8_e5m2, 2, __fp8x2_e5m2); +DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); +DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); +DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); #endif +// fp8_e4m3b15 vectors (always available — software type, no HW dependency) +DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15); +DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15); +DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2); +DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4); +#undef DEFINE_VEC + +#if defined(MSCCLPP_DEVICE_COMPILE) +template +MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u{.f = src}; + return u.t; +} + +template +MSCCLPP_DEVICE_INLINE T clip(T val) { + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half clip(__half val) { + val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); + val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); + + return val; +} + +template <> 
+MSCCLPP_DEVICE_INLINE __half2 clip(__half2 val) { + val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); + val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); + val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); + val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat16 clip(__bfloat16 val) { + val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); + val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat162 clip(__bfloat162 val) { + val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); + val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +// FP8 E4M3 clipping function +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 clip(__fp8_e4m3 val) { + // FP8 E4M3 has range [-448, 448], no infinities + // Built-in saturation in FP8 arithmetic + return val; +} + +// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { + // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow + // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) + float fval = float(val); + fval = fmaxf(fval, -57344.0f); + fval = fminf(fval, 57344.0f); + return __fp8_e5m2(fval); +} +#endif + +// --- f32x2 arithmetic --- + +template +MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000) + // Blackwell (SM 10.0+): packed float2 add in a single instruction. + return __fadd2_rn(a.storage, b.storage); +#else + f32x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { + __half2 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +template +MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) { + // Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd). + const f16x2* a2 = reinterpret_cast(&a); + const f16x2* b2 = reinterpret_cast(&b); + f16x4 result; + f16x2* r2 = reinterpret_cast(&result); + r2[0] = a2[0] + b2[0]; + r2[1] = a2[1] + b2[1]; + return result; +} + +template +MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { + __bfloat162 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +#if defined(__FP8_TYPES_EXIST__) +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3 operator+(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition (CUDA 11.8+) + __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); + return UseClip ? 
clip(result) : result; +#else + // Fallback for other devices + __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); + return UseClip ? clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x2 operator+(const f8_e4m3x2& a, const f8_e4m3x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false))); +#elif defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + return __fp8x2_e4m3(__hadd2(__half2(static_cast<__fp8x2_e4m3>(a)), __half2(static_cast<__fp8x2_e4m3>(b)))); +#else + // Fallback for other devices: element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x4 operator+(const f8_e4m3x4& a, const f8_e4m3x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E4M3 using fp8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e4m3x2 using operator+ for 2 elements + const f8_e4m3x2* a_pair = reinterpret_cast(&a); + const f8_e4m3x2* b_pair = reinterpret_cast(&b); + + f8_e4m3x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} + +template +MSCCLPP_DEVICE_INLINE __fp8_e5m2 operator+(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 (bfloat8) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition + __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x2 operator+(const f8_e5m2x2& a, const f8_e5m2x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + f8_e5m2x2 result = + __fp8x2_e5m2(__hadd2(__half2(static_cast<__fp8x2_e5m2>(a)), __half2(static_cast<__fp8x2_e5m2>(b)))); + if constexpr (UseClip) { + result = clip(result); + } + return result; +#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // HIP gfx942: Use BF8 assembly instructions + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.data[0].__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.data[0].__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false))); +#else + // Fallback: element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x4 operator+(const f8_e5m2x4& a, const f8_e5m2x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E5M2 using bf8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e5m2x2 using operator+ for 2 elements + const f8_e5m2x2* a_pair = reinterpret_cast(&a); + const f8_e5m2x2* b_pair = reinterpret_cast(&b); + f8_e5m2x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} +#endif // defined(__FP8_TYPES_EXIST__) + +MSCCLPP_DEVICE_INLINE u8x4 operator+(const u8x4& a, const u8x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) + // Optimized uint8_t x 4 sum using byte permute to avoid overflow between adjacent bytes + constexpr uint32_t even = 0x00ff00ffu; + uint32_t ua = a.storage; + uint32_t ub = b.storage; + uint32_t x = (ua & even) + (ub & even); + uint32_t y = (ua & ~even) + (ub & ~even); + return __byte_perm(x, y, 0x7250); +#else + return __vadd4(a.storage, b.storage); +#endif +} + +template +MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { + return (a < b ? 
a : b); +} + +template <> +MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) { + f32x2 result; + result.data[0] = fminf(a.data[0], b.data[0]); + result.data[1] = fminf(a.data[1], b.data[1]); + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + f16x2 val; + val[0] = __hmin(a[0], b[0]); + val[1] = __hmin(a[1], b[1]); + return val; +#else + __half2 ret = __hmin2(a, b); + return ret; +#endif +} + +template <> +MSCCLPP_DEVICE_INLINE bf16x2 min(const bf16x2& a, const bf16x2& b) { + return __hmin2(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) + // Optimized uint8_t x 4 min using 9-bit arithmetic + constexpr uint32_t ones = 0x01010101u; + constexpr uint32_t even = 0x00ff00ffu; // even byte mask + uint32_t ua = a.storage; + uint32_t ub = b.storage; + // Use 9-bit arithmetic to compute d=a-b for each byte + uint32_t d0 = (ua & even) + (~ub & even) + ones; + uint32_t d1 = ((ua >> 8) & even) + (~(ub >> 8) & even) + ones; + // Move sign bit of each 9-bit delta into the least bit of origin byte + uint32_t s = __byte_perm(d0, d1, 0x7351) & ones; + // Broadcast least bit across whole byte + s *= 0xffu; + // Compose result by selecting bytes via: signbit(a-b)==1 ? a : b + return (ua & s) | (ub & ~s); +#else + return __vminu4(a.storage, b.storage); +#endif +} + +/// Convert a vector type From to vector type To. +/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks, +/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached. +/// Specialized below for optimized FP8 conversion paths at x2/x4 level. +template +MSCCLPP_DEVICE_INLINE To to(const From& v) { + static_assert(To::Size == From::Size, "to: vector sizes must match"); + constexpr int N = From::Size; + + // Auto-decompose: N > 4 → split into x4 chunks + if constexpr (N > 4 && N % 4 == 0) { + constexpr int nChunks = N / 4; + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); +#pragma unroll + for (int c = 0; c < nChunks; ++c) { + out[c] = to(in[c]); + } + return result; + } + // Auto-decompose: N == 4 → split into 2x x2 chunks + else if constexpr (N == 4) { + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); + out[0] = to(in[0]); + out[1] = to(in[1]); + return result; + } + // Base case: element-wise conversion + else { + To result; +#pragma unroll + for (int i = 0; i < N; ++i) { + result.data[i] = static_cast(v.data[i]); + } + return result; + } +} + +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e4m3(fminf(float(a), float(b))); +#else + return __fp8_e4m3(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x2 min(const f8_e4m3x2& a, const f8_e4m3x2& b) { + // Process element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x4 min(const f8_e4m3x4& a, const f8_e4m3x4& b) { + // Process as two f8_e4m3x2 using min for 2 elements + const f8_e4m3x2* a_ptr = 
reinterpret_cast(&a); + const f8_e4m3x2* b_ptr = reinterpret_cast(&b); + + f8_e4m3x4 result; + f8_e4m3x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 min(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e5m2(fminf(float(a), float(b))); +#else + return __fp8_e5m2(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x2 min(const f8_e5m2x2& a, const f8_e5m2x2& b) { + // Process element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { + // Process as two f8_e5m2x2 using min for 2 elements + const f8_e5m2x2* a_ptr = reinterpret_cast(&a); + const f8_e5m2x2* b_ptr = reinterpret_cast(&b); + + f8_e5m2x4 result; + f8_e5m2x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +// --- f8_e4m3 -> f32 specializations --- + +/// f8_e4m3x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8). +template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e4m3x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f8_e5m2 -> f32 specializations --- + +/// f8_e5m2x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8). 
+template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e5m2x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e5m2x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e5m2x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e5m2x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e4m3 specializations (downcast) --- + +/// f32x2 -> f8_e4m3x2. +/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32). +/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3x4. 
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x4 to<f8_e4m3x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e4m3x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e4m3x2 lo_fp8 = to<f8_e4m3x2>(lo);
+  f8_e4m3x2 hi_fp8 = to<f8_e4m3x2>(hi);
+  f8_e4m3x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f32 -> f8_e5m2 specializations (downcast) ---
+
+/// f32x2 -> f8_e5m2x2.
+/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32).
+/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2).
+/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x2 to<f8_e5m2x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  return bit_cast<f8_e5m2x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2;
+  h2.x = bit_cast<uint16_t>(__float2half_rn(v.data[0]));
+  h2.y = bit_cast<uint16_t>(__float2half_rn(v.data[1]));
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2);
+  return bit_cast<f8_e5m2x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<uint16_t>(__float2half_rn(v.data[0]));
+  h1.x = bit_cast<uint16_t>(__float2half_rn(v.data[1]));
+  f8_e5m2x2 result;
+  result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2));
+  result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2));
+  return result;
+#else
+  f8_e5m2x2 result;
+  result.data[0] = static_cast<__fp8_e5m2>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e5m2>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e5m2x4.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e5m2x4 to<f8_e5m2x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false);
+  packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true);
+  return bit_cast<f8_e5m2x4>(packed);
+#else
+  f32x2 lo, hi;
+  lo.data[0] = v.data[0];
+  lo.data[1] = v.data[1];
+  hi.data[0] = v.data[2];
+  hi.data[1] = v.data[3];
+  f8_e5m2x2 lo_fp8 = to<f8_e5m2x2>(lo);
+  f8_e5m2x2 hi_fp8 = to<f8_e5m2x2>(hi);
+  f8_e5m2x4 result;
+  result.data[0] = lo_fp8.data[0];
+  result.data[1] = lo_fp8.data[1];
+  result.data[2] = hi_fp8.data[0];
+  result.data[3] = hi_fp8.data[1];
+  return result;
+#endif
+}
+
+// --- f8_e4m3 <-> f16 conversion specializations ---
+
+/// f8_e4m3x2 -> f16x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: fp8 -> float -> half (via AMD builtin).
+/// Pre-SM90 / fallback: element-wise scalar conversion.
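+/// (e4m3 bit layout: 1 sign, 4 exponent (bias 7), 3 mantissa bits; e.g. 0x38 = 0b0'0111'000
+/// encodes 2^(7-7) * 1.0 = 1.0.)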
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2>(const f8_e4m3x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0);
+  f16x2 result;
+  result.data[0] = __float2half(f[0]);
+  result.data[1] = __float2half(f[1]);
+  return result;
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3);
+  return bit_cast<f16x2>(h2);
+#else
+  f16x2 result;
+  result.data[0] = static_cast<__half>(v.data[0]);
+  result.data[1] = static_cast<__half>(v.data[1]);
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3x2.
+/// NVIDIA SM90+: packed intrinsic (1 instruction).
+/// HIP gfx942: half -> float -> fp8 (via AMD builtin).
+/// Pre-SM90: element-wise scalar conversion.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3x2 to<f8_e4m3x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  float f0 = __half2float(v.data[0]);
+  float f1 = __half2float(v.data[1]);
+  uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false);
+  return bit_cast<f8_e4m3x2>(static_cast<__hip_fp8x2_storage_t>(packed));
+#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900
+  __half2_raw h2 = bit_cast<__half2_raw>(v);
+  __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3);
+  return bit_cast<f8_e4m3x2>(fp8x2);
+#elif defined(MSCCLPP_DEVICE_CUDA)
+  __half_raw h0, h1;
+  h0.x = bit_cast<uint16_t>(v.data[0]);
+  h1.x = bit_cast<uint16_t>(v.data[1]);
+  f8_e4m3x2 result;
+  result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3));
+  result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3));
+  return result;
+#else
+  f8_e4m3x2 result;
+  result.data[0] = static_cast<__fp8_e4m3>(v.data[0]);
+  result.data[1] = static_cast<__fp8_e4m3>(v.data[1]);
+  return result;
+#endif
+}
+
+#endif  // defined(__FP8_TYPES_EXIST__)
+
+// --- fp8_e4m3b15 <-> fp16 direct conversion specializations ---
+// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift
+// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits
+// vs fp8's 3. fp32 conversions are derived by routing through fp16.
+
+/// f8_e4m3b15x2 -> f16x2.
+/// Direct fp8 -> fp16 via branch-free bit manipulation.
+template <>
+MSCCLPP_DEVICE_INLINE f16x2 to<f16x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint16_t in = v.storage.__x;
+  // Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5.
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  __half2 h;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h)) : "r"(out0));
+  return h;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: same bit manipulation as CUDA, store packed fp16 bits via words[].
+  uint16_t in = v.storage.__x;
+  uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24);
+  uint32_t b0 = (a0 & 0x7f007f00u) >> 1;
+  uint32_t out0 = b0 | (a0 & 0x80008000u);
+  f16x2 result;
+  result.words[0] = out0;
+  return result;
+#else
+  f16x2 result;
+  result.data[0] = __float2half(float(v.data[0]));
+  result.data[1] = __float2half(float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f16x4.
+/// Uses __byte_perm + lop3 for branch-free vectorized conversion.
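+/// Worked example of the shift: b15 byte 0x78 (sign 0, exp 1111 = 15, mantissa 000, i.e.
+/// 1.0) sits in an fp16 high byte as 0x7800; shifting the magnitude right by one gives
+/// 0x3C00, which is exactly 1.0 in fp16.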
+template <>
+MSCCLPP_DEVICE_INLINE f16x4 to<f16x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t a0_shr = a0 >> 1;
+  uint32_t a0_sign = a0 & 0x80008000u;
+  uint32_t out0;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign));
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t a1_shr = a1 >> 1;
+  uint32_t a1_sign = a1 & 0x80008000u;
+  uint32_t out1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign));
+  f16x4 result;
+  asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0));
+  asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1));
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: __byte_perm + bitwise E4→E5 shift (no lop3), store via words[].
+  uint32_t in = v.storage.__x;
+  uint32_t a0 = __byte_perm(0u, in, 0x5746u);
+  uint32_t out0 = ((a0 >> 1) & 0x3f803f80u) | (a0 & 0x80008000u);
+  uint32_t a1 = __byte_perm(a0, 0u, 0x2301u);
+  uint32_t out1 = ((a1 >> 1) & 0x3f803f80u) | (a1 & 0x80008000u);
+  f16x4 result;
+  result.words[0] = out0;
+  result.words[1] = out1;
+  return result;
+#else
+  f16x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __float2half(float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+/// f16x2 -> f8_e4m3b15x2.
+/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack.
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2>(const f16x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
+  // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
+  uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
+  uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
+  alo = alo < 0x3B80u ? alo : 0x3B80u;
+  ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
+  uint32_t a0 = alo | (ahi << 16);
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, pack.
+  uint32_t in0 = v.words[0];
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t a0;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
+  return bit_cast<f8_e4m3b15x2>(packed);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(__half2float(v.data[0]));
+  result.data[1] = __fp8_e4m3b15(__half2float(v.data[1]));
+  return result;
+#endif
+}
+
+/// f16x4 -> f8_e4m3b15x4.
+/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion.
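+/// After the clamp, "a * 2 + 0x0080" (per 16-bit lane) shifts the magnitude from E5 to E4
+/// position and adds half of the soon-discarded low byte, so keeping only the high byte
+/// rounds half-up to the nearest representable fp8 value.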
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4>(const f16x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  uint32_t in0, in1;
+  asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
+  uint32_t abs0 = in0 & 0x7fff7fffu;
+  uint32_t abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
+  uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0, b1;
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u));
+  asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u));
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  // gfx942: read packed fp16 bits, clamp via v_pk_min_u16, shift E5→E4, __byte_perm pack.
+  uint32_t in0 = v.words[0], in1 = v.words[1];
+  uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
+  uint32_t a0, a1;
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
+  a0 = a0 * 2u + 0x00800080u;
+  a1 = a1 * 2u + 0x00800080u;
+  uint32_t b0 = a0 | (in0 & 0x80008000u);
+  uint32_t b1 = a1 | (in1 & 0x80008000u);
+  uint32_t packed = __byte_perm(b0, b1, 0x7531u);
+  return bit_cast<f8_e4m3b15x4>(packed);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(__half2float(v.data[i]));
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 <-> f32 conversion specializations (software, always available) ---
+
+/// f8_e4m3b15x2 -> f32x2.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x2 to<f32x2>(const f8_e4m3b15x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x2 h = to<f16x2>(v);
+  float2 f2 = __half22float2(h);
+  return bit_cast<f32x2>(f2);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h = to<f16x2>(v);
+  f32x2 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  return result;
+#else
+  f32x2 result;
+  result.data[0] = float(v.data[0]);
+  result.data[1] = float(v.data[1]);
+  return result;
+#endif
+}
+
+/// f8_e4m3b15x4 -> f32x4.
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32.
+template <>
+MSCCLPP_DEVICE_INLINE f32x4 to<f32x4>(const f8_e4m3b15x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  f16x4 h = to<f16x4>(v);
+  __half2 h0, h1;
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h0)) : "r"(h.words[0]));
+  asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast<uint32_t*>(&h1)) : "r"(h.words[1]));
+  float2 f0 = __half22float2(h0);
+  float2 f1 = __half22float2(h1);
+  f32x4 result;
+  result.data[0] = f0.x;
+  result.data[1] = f0.y;
+  result.data[2] = f1.x;
+  result.data[3] = f1.y;
+  return result;
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h = to<f16x4>(v);
+  f32x4 result;
+  result.data[0] = __half2float(h.data[0]);
+  result.data[1] = __half2float(h.data[1]);
+  result.data[2] = __half2float(h.data[2]);
+  result.data[3] = __half2float(h.data[3]);
+  return result;
+#else
+  f32x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = float(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+/// f32x2 -> f8_e4m3b15x2.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
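+/// The two-step rounding is benign: fp16 carries 11 significand bits and e4m3b15 carries 4,
+/// and 11 >= 2*4 + 2, so round-to-nearest f32 -> fp16 -> fp8 matches a direct f32 -> fp8
+/// round (the standard double-rounding bound).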
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2>(const f32x2& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f2 = {v.data[0], v.data[1]};
+  __half2 h = __float22half2_rn(f2);
+  return to<f8_e4m3b15x2>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x2 h;
+  h.data[0] = __float2half_rn(v.data[0]);
+  h.data[1] = __float2half_rn(v.data[1]);
+  return to<f8_e4m3b15x2>(h);
+#else
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(v.data[0]);
+  result.data[1] = __fp8_e4m3b15(v.data[1]);
+  return result;
+#endif
+}
+
+/// f32x4 -> f8_e4m3b15x4.
+/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack).
+template <>
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4>(const f32x4& v) {
+#if defined(MSCCLPP_DEVICE_CUDA)
+  float2 f01 = {v.data[0], v.data[1]};
+  float2 f23 = {v.data[2], v.data[3]};
+  __half2 h01 = __float22half2_rn(f01);
+  __half2 h23 = __float22half2_rn(f23);
+  f16x4 h;
+  asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast<uint32_t*>(&h01)));
+  asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast<uint32_t*>(&h23)));
+  return to<f8_e4m3b15x4>(h);
+#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__)
+  f16x4 h;
+  h.words[0] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[0], v.data[1]));
+  h.words[1] = __builtin_bit_cast(uint32_t, __builtin_amdgcn_cvt_pkrtz(v.data[2], v.data[3]));
+  return to<f8_e4m3b15x4>(h);
+#else
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(v.data[i]);
+  }
+  return result;
+#endif
+}
+
+// --- fp8_e4m3b15 arithmetic (software, always available) ---
+
+template
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(float(a) + float(b));
+}
+
+template
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0]));
+  result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1]));
+  return result;
+}
+
+template
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i]));
+  }
+  return result;
+}
+
+// --- fp8_e4m3b15 min (software) ---
+
+template <>
+MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) {
+  return __fp8_e4m3b15(fminf(float(a), float(b)));
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) {
+  f8_e4m3b15x2 result;
+  result.data[0] = mscclpp::min(a.data[0], b.data[0]);
+  result.data[1] = mscclpp::min(a.data[1], b.data[1]);
+  return result;
+}
+
+MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) {
+  f8_e4m3b15x4 result;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    result.data[i] = mscclpp::min(a.data[i], b.data[i]);
+  }
+  return result;
+}
+
+#endif  // MSCCLPP_DEVICE_COMPILE
 
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_GPU_DATA_TYPES_HPP_
diff --git a/include/mscclpp/proxy.hpp b/include/mscclpp/proxy.hpp
index 36a56a90..990deabb 100644
--- a/include/mscclpp/proxy.hpp
+++ b/include/mscclpp/proxy.hpp
@@ -29,7 +29,9 @@ class Proxy {
  public:
   /// Constructor.
   /// @param handler Handler for each FIFO trigger.
-  /// @param threadInit Optional function run in proxy thread before FIFO consumption.
+  /// @param threadInit Optional function run once in the proxy thread before FIFO consumption.
+  /// Use it to initialize per-thread runtime state before the proxy makes any CUDA API call
+  /// (for example, set the CUDA device and optionally bind NUMA affinity).
   /// @param fifoSize FIFO size (default: DEFAULT_FIFO_SIZE).
   Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize = DEFAULT_FIFO_SIZE);
diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp
index 27f9aefa..4d1f2e32 100644
--- a/include/mscclpp/semaphore.hpp
+++ b/include/mscclpp/semaphore.hpp
@@ -16,6 +16,7 @@ namespace mscclpp {
 class Host2DeviceSemaphore {
  private:
   Semaphore semaphore_;
+  std::shared_ptr<uint64_t> inboundToken_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
   std::unique_ptr<uint64_t> outboundToken_;
@@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
   /// @param connection The connection associated with this semaphore.
   Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);
 
+  /// Destructor.
+  ~Host2DeviceSemaphore();
+
+  /// Move constructor.
+  Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;
+
+  /// Move assignment operator.
+  Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;
+
   /// Returns the connection.
   /// @return The connection associated with this semaphore.
   Connection& connection();
@@ -82,7 +92,6 @@ class MemoryDevice2DeviceSemaphore {
  private:
   Semaphore semaphore_;
   detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
-  detail::UniqueGpuPtr<uint64_t> outboundToken_;
 
  public:
   /// Constructor.
diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp
index f1b01e89..a790a6e1 100644
--- a/include/mscclpp/semaphore_device.hpp
+++ b/include/mscclpp/semaphore_device.hpp
@@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
 
   /// Signal remote device, ensures prior memory ops complete.
   MSCCLPP_DEVICE_INLINE void signal() {
-    auto outbound = incOutbound();
-#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800)
-    // Using memoryOrderSeqCst is faster for A100.
-    atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst);
-#else
-    atomicStore(remoteInboundToken, outbound, memoryOrderRelease);
+#if defined(MSCCLPP_DEVICE_CUDA)
+    asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
+#elif defined(MSCCLPP_DEVICE_HIP)
+    (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease);
 #endif
   }
 
   /// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data.
   MSCCLPP_DEVICE_INLINE void relaxedSignal() {
-    auto outbound = incOutbound();
-    atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed);
+#if defined(MSCCLPP_DEVICE_CUDA)
+    asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory");
+#elif defined(MSCCLPP_DEVICE_HIP)
+    (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed);
+#endif
   }
 
   /// Thread-safe read of expected inbound value.
@@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
     return atomicLoad(inboundToken, memoryOrderRelaxed);
   }
 
-  /// Thread-safe read of outbound value.
-  /// @return The outbound value.
-  MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {
-    return atomicLoad(outboundToken, memoryOrderRelaxed);
-  }
-
-  /// Thread-safe increment of outbound value.
-  /// @return The incremented outbound value.
- MSCCLPP_DEVICE_INLINE uint64_t incOutbound() { - return atomicFetchAdd(outboundToken, 1, memoryOrderRelaxed) + 1; - } #endif // defined(MSCCLPP_DEVICE_COMPILE) /// A local memory space where the remote device will write its semaphore value and the local device will read it. uint64_t* inboundToken; - /// A local memory space where the local device stores the semaphore value to be written to the remote device. - uint64_t* outboundToken; - - /// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the - /// remote device. + /// A remote memory space where the local device atomically increments. This is inboundToken of the remote device. uint64_t* remoteInboundToken; /// A local memory space where the local device stores the expected value of the inboundToken to wait for. diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 5f8a1608..b52b6572 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) @@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm 
volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b84cea3a..5e784e92 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,6 +4,10 @@ add_subdirectory(csrc) add_subdirectory(test) +target_compile_definitions(mscclpp_py PRIVATE + $<$:MSCCLPP_DISABLE_NB_LEAK_WARNINGS> +) + add_custom_target(pytest_lib_copy ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so @@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so ${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp DEPENDS mscclpp_py mscclpp_py_test -) +) \ No newline at end of file diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt index 8759201f..44fb150f 100644 --- a/python/csrc/CMakeLists.txt +++ b/python/csrc/CMakeLists.txt @@ -24,4 +24,7 @@ set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp) set_target_properties(mscclpp_py PROPERTIES INSTALL_RPATH "\$ORIGIN/lib") target_link_libraries(mscclpp_py PRIVATE dlpack mscclpp mscclpp_collectives ${GPU_LIBRARIES}) target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) +if(MSCCLPP_USE_ROCM) + target_compile_definitions(mscclpp_py PRIVATE MSCCLPP_USE_ROCM) +endif() install(TARGETS mscclpp_py LIBRARY DESTINATION .) 
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp
index 5a9c4bd6..a9aa2727 100644
--- a/python/csrc/algorithm.cpp
+++ b/python/csrc/algorithm.cpp
@@ -16,14 +16,16 @@ namespace nb = nanobind;
 using namespace mscclpp;
 
 void register_algorithm(nb::module_& m) {
-  nb::enum_<CollectiveBufferMode>(m, "CollectiveBufferMode")
+  nb::enum_<CollectiveBufferMode>(m, "CppCollectiveBufferMode")
       .value("ANY", CollectiveBufferMode::Any)
       .value("IN_PLACE", CollectiveBufferMode::InPlace)
       .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace);
 
-  nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL);
+  nb::enum_<AlgorithmType>(m, "CppAlgorithmType")
+      .value("NATIVE", AlgorithmType::Native)
+      .value("DSL", AlgorithmType::DSL);
 
-  nb::enum_<CommResult>(m, "CommResult")
+  nb::enum_<CommResult>(m, "CppCommResult")
       .value("COMM_SUCCESS", CommResult::CommSuccess)
       .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError)
       .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError)
@@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) {
       .value("COMM_IN_PROGRESS", CommResult::CommInProgress)
       .value("COMM_NUM_RESULTS", CommResult::CommNumResults);
 
-  nb::enum_<ReduceOp>(m, "ReduceOp")
+  nb::enum_<ReduceOp>(m, "CppReduceOp")
      .value("SUM", ReduceOp::SUM)
      .value("MIN", ReduceOp::MIN)
      .value("NOP", ReduceOp::NOP);
 
   auto algorithmClass =
-      nb::class_<Algorithm>(m, "Algorithm")
+      nb::class_<Algorithm>(m, "CppAlgorithm")
          .def_static(
              "from_native_capsule",
              [](nb::capsule cap) {
@@ -58,6 +60,12 @@ void register_algorithm(nb::module_& m) {
          .def_prop_ro("name", &Algorithm::name)
          .def_prop_ro("collective", &Algorithm::collective)
          .def_prop_ro("message_range", &Algorithm::messageRange)
+         .def(
+             "set_message_size_range",
+             [](Algorithm& self, size_t minMessageSize, size_t maxMessageSize) {
+               self.setMessageSizeRange(minMessageSize, maxMessageSize);
+             },
+             nb::arg("min_message_size"), nb::arg("max_message_size"))
          .def_prop_ro("tags", &Algorithm::tags)
          .def_prop_ro("buffer_mode", &Algorithm::bufferMode)
          .def_prop_ro("constraint", &Algorithm::constraint)
@@ -67,16 +75,19 @@ void register_algorithm(nb::module_& m) {
              "execute",
              [](Algorithm& self, std::shared_ptr<Communicator> comm, uintptr_t input, uintptr_t output,
                 size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream,
-                std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock,
-                std::unordered_map<std::string, uintptr_t> extras) {
+                std::shared_ptr<Executor> executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory,
+                std::unordered_map<std::string, uintptr_t> extras, int32_t accumDtype) {
                return self.execute(comm, reinterpret_cast<void*>(input), reinterpret_cast<void*>(output), inputSize,
                                    outputSize, dtype, op, reinterpret_cast<cudaStream_t>(stream), executor,
-                                   nBlocks, nThreadsPerBlock, extras);
+                                   nBlocks, nThreadsPerBlock, symmetricMemory, extras,
+                                   static_cast<DataType>(accumDtype));
              },
              nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"),
              nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr,
-             nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0,
-             nb::arg("extras") = std::unordered_map<std::string, uintptr_t>());
+             nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false,
+             nb::arg("extras") = std::unordered_map<std::string, uintptr_t>(),
+             nb::arg("accum_dtype") = static_cast<int32_t>(DataType::AUTO))
+         .def("reset", &Algorithm::reset);
 
   nb::class_<Algorithm::Constraint>(algorithmClass, "Constraint")
      .def(nb::init<>())
@@ -84,21 +95,21 @@ void register_algorithm(nb::module_& m) {
      .def_rw("world_size", &Algorithm::Constraint::worldSize)
      .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode);
 
-  nb::class_<AlgorithmBuilder>(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build);
"AlgorithmBuilder").def("build", &AlgorithmBuilder::build); + nb::class_(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build); - nb::class_(m, "DslAlgorithm") + nb::class_(m, "CppDslAlgorithm") .def(nb::init, Algorithm::Constraint>(), nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map(), nb::arg("constraint") = Algorithm::Constraint()) .def("build", &DslAlgorithm::build); - nb::class_(m, "AlgorithmCollection") + nb::class_(m, "CppAlgorithmCollection") .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"), nb::arg("algorithm")) .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective")) .def("to_list", &AlgorithmCollection::getAllAlgorithms); - nb::class_(m, "CollectiveRequest") + nb::class_(m, "CppCollectiveRequest") .def_ro("world_size", &CollectiveRequest::worldSize) .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode) .def_ro("rank", &CollectiveRequest::rank) @@ -107,8 +118,22 @@ void register_algorithm(nb::module_& m) { .def_prop_ro("output_buffer", [](const CollectiveRequest& self) { return reinterpret_cast(self.outputBuffer); }) .def_ro("message_size", &CollectiveRequest::messageSize) + .def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast(self.stream); }) .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; }) .def_ro("dtype", &CollectiveRequest::dtype) .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; }) .def("buffer_mode", &CollectiveRequest::bufferMode); + + m.def( + "cpp_get_flag_buffer", + []() { + auto [buffer, size] = getFlagBuffer(); + uintptr_t ptr = reinterpret_cast(buffer.get()); + // Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime. + auto prevent = std::make_unique>(std::move(buffer)); + nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast*>(p); }); + prevent.release(); // capsule now owns the pointer + return nb::make_tuple(ptr, size, owner); + }, + "Get the default flag buffer. 
 }
\ No newline at end of file
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index c1462a11..ec64d744 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -32,21 +32,25 @@ extern void register_algorithm_collection_builder(nb::module_& m);
 
 template <typename T>
 void def_shared_future(nb::handle& m, const std::string& typestr) {
-  std::string pyclass_name = std::string("shared_future_") + typestr;
+  std::string pyclass_name = std::string("CppSharedFuture_") + typestr;
   nb::class_<std::shared_future<T>>(m, pyclass_name.c_str()).def("get", &std::shared_future<T>::get);
 }
 
 void register_core(nb::module_& m) {
   m.def("version", &version);
 
-  nb::enum_<DataType>(m, "DataType")
+  nb::enum_<DataType>(m, "CppDataType")
      .value("int32", DataType::INT32)
      .value("uint32", DataType::UINT32)
      .value("float16", DataType::FLOAT16)
      .value("float32", DataType::FLOAT32)
-     .value("bfloat16", DataType::BFLOAT16);
+     .value("bfloat16", DataType::BFLOAT16)
+     .value("float8_e4m3", DataType::FLOAT8_E4M3)
+     .value("float8_e5m2", DataType::FLOAT8_E5M2)
+     .value("uint8", DataType::UINT8)
+     .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
 
-  nb::class_<Bootstrap>(m, "Bootstrap")
+  nb::class_<Bootstrap>(m, "CppBootstrap")
      .def("get_rank", &Bootstrap::getRank)
      .def("get_n_ranks", &Bootstrap::getNranks)
      .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode)
@@ -71,7 +75,7 @@ void register_core(nb::module_& m) {
      .def("recv", static_cast<void (Bootstrap::*)(std::vector<char>&, int, int)>(&Bootstrap::recv), nb::arg("data"),
           nb::arg("peer"), nb::arg("tag"));
 
-  nb::class_<UniqueId>(m, "UniqueId")
+  nb::class_<UniqueId>(m, "CppUniqueId")
      .def(nb::init<>())
      .def("__setstate__",
           [](UniqueId& self, nb::bytes b) {
@@ -81,7 +85,7 @@ void register_core(nb::module_& m) {
      .def("__getstate__",
           [](const UniqueId& self) { return nb::bytes(reinterpret_cast<const char*>(self.data()), UniqueIdBytes); });
 
-  nb::class_<TcpBootstrap, Bootstrap>(m, "TcpBootstrap")
+  nb::class_<TcpBootstrap, Bootstrap>(m, "CppTcpBootstrap")
      .def(nb::init<int, int>(), "Do not use this constructor. 
Use create instead.") .def_static( "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), @@ -93,7 +97,7 @@ void register_core(nb::module_& m) { .def("initialize", static_cast(&TcpBootstrap::initialize), nb::call_guard(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30); - nb::enum_(m, "Transport") + nb::enum_(m, "CppTransport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) .value("IB0", Transport::IB0) @@ -106,7 +110,7 @@ void register_core(nb::module_& m) { .value("IB7", Transport::IB7) .value("NumTransports", Transport::NumTransports); - nb::class_(m, "TransportFlags") + nb::class_(m, "CppTransportFlags") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def("has", &TransportFlags::has, nb::arg("transport")) @@ -130,12 +134,12 @@ void register_core(nb::module_& m) { .def(nb::self == nb::self) .def(nb::self != nb::self); - nb::enum_(m, "DeviceType") + nb::enum_(m, "CppDeviceType") .value("Unknown", DeviceType::Unknown) .value("CPU", DeviceType::CPU) .value("GPU", DeviceType::GPU); - nb::class_(m, "Device") + nb::class_(m, "CppDevice") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("type")) .def(nb::init(), nb::arg("type"), nb::arg("id") = -1) @@ -147,24 +151,33 @@ void register_core(nb::module_& m) { return ss.str(); }); - nb::class_(m, "EndpointConfigIb") + nb::enum_(m, "CppIbMode") + .value("Default", EndpointConfig::Ib::Mode::Default) + .value("Host", EndpointConfig::Ib::Mode::Host) + .value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic); + + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) - .def(nb::init(), nb::arg("device_index") = -1, + .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex, nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize, nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum, nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr, - nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend) + nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr, + nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend, + nb::arg("mode") = EndpointConfig::Ib::Mode::Default) .def_rw("device_index", &EndpointConfig::Ib::deviceIndex) .def_rw("port", &EndpointConfig::Ib::port) .def_rw("gid_index", &EndpointConfig::Ib::gidIndex) .def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize) .def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum) .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) - .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); + .def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr) + .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend) + .def_rw("mode", &EndpointConfig::Ib::mode); - nb::class_(m, "RegisteredMemory") + nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) .def("data", [](RegisteredMemory& self) { return reinterpret_cast(self.data()); }) .def("size", &RegisteredMemory::size) @@ -172,7 +185,7 @@ void register_core(nb::module_& m) { .def("serialize", &RegisteredMemory::serialize) .def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data")); - nb::class_(m, "Endpoint") + nb::class_(m, "CppEndpoint") .def("config", &Endpoint::config) .def("transport", &Endpoint::transport) .def("device", &Endpoint::device) @@ -180,7 +193,7 @@ void register_core(nb::module_& m) { .def("serialize", &Endpoint::serialize) .def_static("deserialize", 
&Endpoint::deserialize, nb::arg("data")); - nb::class_(m, "Connection") + nb::class_(m, "CppConnection") .def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"), nb::arg("size")) .def( @@ -197,7 +210,7 @@ void register_core(nb::module_& m) { .def("local_device", &Connection::localDevice) .def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize); - nb::class_(m, "EndpointConfig") + nb::class_(m, "CppEndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def(nb::init(), nb::arg("transport"), nb::arg("device"), @@ -223,12 +236,18 @@ void register_core(nb::module_& m) { .def_prop_rw( "ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; }, [](EndpointConfig& self, int v) { self.ib.maxSendWr = v; }) + .def_prop_rw( + "ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; }, + [](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; }) .def_prop_rw( "ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; }, [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) + .def_prop_rw( + "ib_mode", [](EndpointConfig& self) { return self.ib.mode; }, + [](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); - nb::class_(m, "Context") + nb::class_(m, "CppContext") .def_static("create", &Context::create) .def( "register_memory", @@ -239,13 +258,13 @@ void register_core(nb::module_& m) { .def("create_endpoint", &Context::createEndpoint, nb::arg("config")) .def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint")); - nb::class_(m, "SemaphoreStub") + nb::class_(m, "CppSemaphoreStub") .def(nb::init(), nb::arg("connection")) .def("memory", &SemaphoreStub::memory) .def("serialize", &SemaphoreStub::serialize) .def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data")); - nb::class_(m, "Semaphore") + nb::class_(m, "CppSemaphore") .def(nb::init<>()) .def(nb::init(), nb::arg("local_stub"), nb::arg("remote_stub")) .def("connection", &Semaphore::connection) @@ -256,7 +275,7 @@ void register_core(nb::module_& m) { def_shared_future(m, "Connection"); def_shared_future(m, "Semaphore"); - nb::class_(m, "Communicator") + nb::class_(m, "CppCommunicator") .def(nb::init, std::shared_ptr>(), nb::arg("bootstrap"), nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) @@ -289,6 +308,9 @@ void register_core(nb::module_& m) { } NB_MODULE(_mscclpp, m) { +#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS + nb::set_leak_warnings(false); +#endif register_env(m); register_error(m); register_port_channel(m); @@ -306,4 +328,4 @@ NB_MODULE(_mscclpp, m) { // ext register_algorithm_collection_builder(m); -} +} \ No newline at end of file diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index a0ba4a4e..d4b2f5da 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -11,7 +11,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_env(nb::module_& m) { - nb::class_(m, "Env") + nb::class_(m, "CppEnv") .def_ro("debug", &Env::debug) .def_ro("debug_subsys", &Env::debugSubsys) .def_ro("debug_file", &Env::debugFile) @@ -20,9 +20,11 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) - .def_ro("execution_plan_dir", &Env::executionPlanDir) + .def_ro("ibv_mode", &Env::ibvMode) + .def_ro("cache_dir", &Env::cacheDir) 
.def_ro("npkit_dump_dir", &Env::npkitDumpDir) - .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); + .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream) + .def_ro("ib_gid_index", &Env::ibGidIndex); m.def("env", &env); } diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index ff532d10..c19a3b15 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -11,18 +11,18 @@ using namespace mscclpp; #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ nb::register_exception_translator( \ - [](const std::exception_ptr &p, void *payload) { \ + [](const std::exception_ptr& p, void* payload) { \ try { \ std::rethrow_exception(p); \ - } catch (const name_ &e) { \ - PyErr_SetObject(reinterpret_cast(payload), \ + } catch (const name_& e) { \ + PyErr_SetObject(reinterpret_cast(payload), \ PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ } \ }, \ m.attr(#name_).ptr()); -void register_error(nb::module_ &m) { - nb::enum_(m, "ErrorCode") +void register_error(nb::module_& m) { + nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) .value("RemoteError", ErrorCode::RemoteError) diff --git a/python/csrc/executor_py.cpp b/python/csrc/executor_py.cpp index 0a196f37..350a1e7a 100644 --- a/python/csrc/executor_py.cpp +++ b/python/csrc/executor_py.cpp @@ -15,16 +15,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_executor(nb::module_& m) { - nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); + nb::enum_(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); - nb::class_(m, "ExecutionPlan") + nb::class_(m, "CppExecutionPlan") .def(nb::init(), nb::arg("planPath"), nb::arg("rank")) .def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); }) .def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); }) .def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); }) .def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); }); - nb::class_(m, "Executor") + nb::class_(m, "CppExecutor") .def(nb::init>(), nb::arg("comm")) .def( "execute", diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 2756edb7..4a3563d9 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,7 @@ using namespace mscclpp; using namespace mscclpp::collective; void register_algorithm_collection_builder(nb::module_& m) { - nb::class_(m, "AlgorithmCollectionBuilder") + nb::class_(m, "CppAlgorithmCollectionBuilder") .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance) .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder")) .def( @@ -29,6 +30,6 @@ void register_algorithm_collection_builder(nb::module_& m) { nb::arg("selector")) .def("build", &AlgorithmCollectionBuilder::build) .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"), - nb::arg("scratch_buffer_size"), nb::arg("rank")) + nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank")) .def_static("reset", 
&AlgorithmCollectionBuilder::reset); } \ No newline at end of file diff --git a/python/csrc/fifo_py.cpp b/python/csrc/fifo_py.cpp index 63be4a33..e8b6a3e2 100644 --- a/python/csrc/fifo_py.cpp +++ b/python/csrc/fifo_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_fifo(nb::module_& m) { - nb::class_(m, "ProxyTrigger") + nb::class_(m, "CppProxyTrigger") .def_prop_rw( "fst", [](const ProxyTrigger& self) { return self.fst; }, [](ProxyTrigger& self, uint64_t v) { self.fst = v; }) @@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) { "snd", [](const ProxyTrigger& self) { return self.snd; }, [](ProxyTrigger& self, uint64_t v) { self.snd = v; }); - nb::class_(m, "FifoDeviceHandle") + nb::class_(m, "CppFifoDeviceHandle") .def_rw("triggers", &FifoDeviceHandle::triggers) .def_rw("tail", &FifoDeviceHandle::tail) .def_rw("head", &FifoDeviceHandle::head) @@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Fifo") + nb::class_(m, "CppFifo") .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) .def("poll", &Fifo::poll) .def("pop", &Fifo::pop) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 66f036e2..60880456 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) { return DLDataType{kDLBfloat, 16, 1}; } else if (type == "torch.float16") { return DLDataType{kDLFloat, 16, 1}; + } else if (type == "torch.float8_e4m3fn") { + return DLDataType{kDLFloat8_e4m3fn, 8, 1}; + } else if (type == "torch.float8_e4m3fnuz") { + return DLDataType{kDLFloat8_e4m3fnuz, 8, 1}; + } else if (type == "torch.float8_e5m2") { + return DLDataType{kDLFloat8_e5m2, 8, 1}; + } else if (type == "torch.float8_e5m2fnuz") { + return DLDataType{kDLFloat8_e5m2fnuz, 8, 1}; + } else if (type == "torch.uint8") { + return DLDataType{kDLUInt, 8, 1}; + } else if (type == "fp8_e4m3b15") { + // No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes. 
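+    // Consumers therefore receive an ordinary uint8 tensor and must reinterpret the raw
+    // bytes as e4m3b15 themselves.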
+ return DLDataType{kDLUInt, 8, 1}; } else { throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage); } @@ -101,7 +114,7 @@ static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::v void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); - nb::class_>(m, "RawGpuBuffer") + nb::class_>(m, "CppRawGpuBuffer") .def(nb::init(), nb::arg("nelems")) .def("nelems", &GpuBuffer::nelems) .def("bytes", &GpuBuffer::bytes) diff --git a/python/csrc/memory_channel_py.cpp b/python/csrc/memory_channel_py.cpp index 4f9d90a0..ecccb1a0 100644 --- a/python/csrc/memory_channel_py.cpp +++ b/python/csrc/memory_channel_py.cpp @@ -11,20 +11,20 @@ namespace nb = nanobind; using namespace mscclpp; void register_memory_channel(nb::module_& m) { - nb::class_(m, "BaseMemoryChannel") + nb::class_(m, "CppBaseMemoryChannel") .def(nb::init<>()) .def(nb::init>(), nb::arg("semaphore")) .def(nb::init(), nb::arg("semaphore")) .def("device_handle", &BaseMemoryChannel::deviceHandle); - nb::class_(m, "BaseMemoryChannelDeviceHandle") + nb::class_(m, "CppBaseMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "MemoryChannel") + nb::class_(m, "CppMemoryChannel") .def(nb::init<>()) .def( "__init__", @@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) { nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0) .def("device_handle", &MemoryChannel::deviceHandle); - nb::class_(m, "MemoryChannelDeviceHandle") + nb::class_(m, "CppMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 0557b72d..8c158354 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -8,8 +8,8 @@ namespace nb = nanobind; -void register_npkit(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); +void register_npkit(nb::module_& m) { + nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); sub_m.def("shutdown", &NpKit::Shutdown); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 2489a479..fadc0f69 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -6,8 +6,8 @@ int getDeviceNumaNode(int cudaDev); void numaBind(int node); }; // namespace mscclpp -void register_numa(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("numa", "numa functions"); +void register_numa(nb::module_& m) { + nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); } diff --git a/python/csrc/port_channel_py.cpp b/python/csrc/port_channel_py.cpp index 4b1aa289..e3dd98f1 100644 --- a/python/csrc/port_channel_py.cpp +++ b/python/csrc/port_channel_py.cpp @@ -11,11 +11,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_port_channel(nb::module_& m) { - nb::class_(m, "BaseProxyService") + nb::class_(m, "CppBaseProxyService") .def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &BaseProxyService::stopProxy); - nb::class_(m, "ProxyService") + nb::class_(m, "CppProxyService") 
.def(nb::init(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE) .def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &ProxyService::stopProxy) @@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) { .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BasePortChannel") + nb::class_(m, "CppBasePortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy")) .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BasePortChannelDeviceHandle") + nb::class_(m, "CppBasePortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) @@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "PortChannel") + nb::class_(m, "CppPortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "PortChannelDeviceHandle") + nb::class_(m, "CppPortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &PortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 665d395e..17c06a7d 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -10,7 +10,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_semaphore(nb::module_& m) { - nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); + nb::class_ host2DeviceSemaphore(m, "CppHost2DeviceSemaphore"); host2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2DeviceSemaphore::connection) @@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Host2HostSemaphore") + nb::class_(m, "CppHost2HostSemaphore") .def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2HostSemaphore::connection) @@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + nb::class_ memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore"); memoryDevice2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &MemoryDevice2DeviceSemaphore::connection) @@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) { nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) .def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken) - .def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken) .def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken) .def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken) .def_prop_ro("raw", 
[](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { diff --git a/python/csrc/switch_channel_py.cpp b/python/csrc/switch_channel_py.cpp index dd72c97e..2d0340dd 100644 --- a/python/csrc/switch_channel_py.cpp +++ b/python/csrc/switch_channel_py.cpp @@ -15,11 +15,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_nvls(nb::module_& m) { - nb::class_(m, "SwitchChannel") + nb::class_(m, "CppSwitchChannel") .def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &SwitchChannel::deviceHandle); - nb::class_(m, "DeviceHandle") + nb::class_(m, "CppSwitchChannelDeviceHandle") .def(nb::init<>()) .def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr) .def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr) @@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection") + nb::class_(m, "CppNvlsConnection") .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size")); m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"), diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 58233a7c..5f3a2302 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -23,35 +23,37 @@ version = { from ._core import * from ._mscclpp import ( - Device, - DeviceType, - Communicator, - Connection, + CppDevice as Device, + CppDeviceType as DeviceType, + CppCommunicator as Communicator, + CppConnection as Connection, connect_nvls_collective, - EndpointConfig, - Fifo, - Semaphore, - Host2DeviceSemaphore, - Host2HostSemaphore, - numa, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - MemoryDevice2DeviceSemaphore, - TcpBootstrap, - Transport, - TransportFlags, - DataType, - ErrorCode, - Executor, - ExecutionPlan, - PacketType, - RawGpuBuffer, - ReduceOp, + CppEndpointConfig as EndpointConfig, + CppEndpointConfigIb as EndpointConfigIb, + CppIbMode as IbMode, + CppFifo as Fifo, + CppSemaphore as Semaphore, + CppHost2DeviceSemaphore as Host2DeviceSemaphore, + CppHost2HostSemaphore as Host2HostSemaphore, + cpp_numa as numa, + CppProxyService as ProxyService, + CppRegisteredMemory as RegisteredMemory, + CppPortChannel as PortChannel, + CppMemoryChannel as MemoryChannel, + CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore, + CppTcpBootstrap as TcpBootstrap, + CppTransport as Transport, + CppTransportFlags as TransportFlags, + CppDataType as DataType, + CppErrorCode as ErrorCode, + CppExecutor as Executor, + CppExecutionPlan as ExecutionPlan, + CppPacketType as PacketType, + CppRawGpuBuffer as RawGpuBuffer, + CppReduceOp as ReduceOp, env, is_nvls_supported, - npkit, + cpp_npkit as npkit, ) __all__ = [ @@ -61,6 +63,8 @@ __all__ = [ "Connection", "connect_nvls_collective", "EndpointConfig", + "EndpointConfigIb", + "IbMode", "ErrorCode", "Fifo", "Semaphore", diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index 6d0e0108..6a6f5f28 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -6,7 +6,7 @@ import shutil import argparse from pathlib import Path -from mscclpp.language import default_algos as def_algo +from mscclpp import default_algos as def_algo from mscclpp.language.collectives import * from mscclpp.language.utils import AlgoSpec @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = 
os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default") + plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) diff --git a/python/mscclpp/_core/__init__.py b/python/mscclpp/_core/__init__.py index e9d886f3..a97c91a0 100644 --- a/python/mscclpp/_core/__init__.py +++ b/python/mscclpp/_core/__init__.py @@ -5,9 +5,3 @@ from .algorithm import * from .comm import * from .compiler import * from .buffer import * - -__all__ = [] -__all__ += algorithm.__all__ -__all__ += comm.__all__ -__all__ += compiler.__all__ -__all__ += buffer.__all__ diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index abaac60c..f12a3027 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -4,18 +4,22 @@ from __future__ import annotations from typing import Optional, Tuple, Dict from functools import cached_property +import cupy as cp from mscclpp._mscclpp import ( - Algorithm as _Algorithm, - DslAlgorithm as _DslAlgorithm, - AlgorithmType as _AlgorithmType, - Communicator, - CollectiveBufferMode, - DataType, - Executor, - ExecutionPlan, - ReduceOp, + CppAlgorithm, + CppDslAlgorithm, + CppAlgorithmType, + CppCommunicator, + CppCollectiveBufferMode, + CppDataType, + CppExecutor, + CppExecutionPlan, + CppReduceOp, + CppAlgorithmBuilder, + CppAlgorithmCollection, + cpp_get_flag_buffer, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -45,7 +49,7 @@ class Algorithm: """ def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0): - self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node) + self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node) @property def world_size(self) -> int: @@ -58,23 +62,23 @@ class Algorithm: def __init__( self, id: Optional[str] = None, - execution_plan: Optional[ExecutionPlan] = None, - native_handle: Optional[_Algorithm] = None, + execution_plan: Optional[CppExecutionPlan] = None, + native_handle: Optional[CppAlgorithm] = None, tags: Optional[Dict[str, int]] = None, constraint: Optional[Constraint] = None, ): if execution_plan is not None: - self._algorithm = _DslAlgorithm( + self._algorithm = CppDslAlgorithm( id, execution_plan, tags=tags if tags is not None else {}, - constraint=constraint._constraint if constraint is not None else _Algorithm.Constraint(), + constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(), ) elif native_handle is not None: self._algorithm = native_handle @classmethod - def create_from_native_handle(cls, handle: _Algorithm): + def create_from_native_handle(cls, handle: CppAlgorithm): """Create an Algorithm instance from a native C++ algorithm handle. Args: @@ -97,7 +101,7 @@ class Algorithm: Returns: A new Algorithm instance wrapping the algorithm from the capsule. 
""" - handle = _Algorithm.from_native_capsule(obj) + handle = CppAlgorithm.from_native_capsule(obj) return cls(native_handle=handle) @cached_property @@ -110,18 +114,31 @@ class Algorithm: """The collective operation this algorithm implements (e.g., "allreduce", "allgather").""" return self._algorithm.collective - @cached_property + @property def message_size_range(self) -> Tuple[int, int]: """The valid message size range (min_size, max_size) in bytes.""" return (self._algorithm.message_range[0], self._algorithm.message_range[1]) + def set_message_size_range(self, min_message_size: int, max_message_size: int): + """Set the valid message size range in bytes. + + Args: + min_message_size: Minimum supported message size in bytes. + max_message_size: Maximum supported message size in bytes. + + Only supported for native algorithms. Raises TypeError for DSL algorithms. + """ + if self.is_dsl_algorithm(): + raise TypeError("set_message_size_range is only supported for native algorithms") + self._algorithm.set_message_size_range(min_message_size, max_message_size) + @cached_property def tags(self) -> Dict[str, int]: """Dictionary of tag names to tag values for algorithm selection hints.""" return self._algorithm.tags @cached_property - def buffer_mode(self) -> CollectiveBufferMode: + def buffer_mode(self) -> CppCollectiveBufferMode: """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).""" return self._algorithm.buffer_mode @@ -131,7 +148,7 @@ class Algorithm: Returns: True if this algorithm is defined using DSL/execution plan, False otherwise. """ - if self._algorithm.type == _AlgorithmType.DSL: + if self._algorithm.type == CppAlgorithmType.DSL: return True return False @@ -141,24 +158,26 @@ class Algorithm: Returns: True if this algorithm is implemented natively, False otherwise. """ - if self._algorithm.type == _AlgorithmType.NATIVE: + if self._algorithm.type == CppAlgorithmType.NATIVE: return True return False def execute( self, - comm: Communicator, + comm: CppCommunicator, input_buffer: int, output_buffer: int, input_size: int, output_size: int, - dtype: DataType, - op: ReduceOp = ReduceOp.NOP, + dtype: CppDataType, + op: CppReduceOp = CppReduceOp.NOP, stream: int = 0, - executor: Optional[Executor] = None, + executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, + symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, + accum_dtype: Optional[CppDataType] = None, ) -> int: """Execute the collective algorithm. @@ -174,11 +193,16 @@ class Algorithm: executor: The executor for DSL algorithms (required for DSL, optional for native). nblocks: Number of CUDA blocks (0 for auto-selection). nthreads_per_block: Number of threads per block (0 for auto-selection). + symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. + accum_dtype: Data type for accumulation during reduction. If None, defaults to + the same as dtype. Use DataType.float32 for high-precision FP8 accumulation. Returns: The result code (0 for success). 
""" + merged_extras = dict(extras) if extras is not None else {} + accum_dtype = accum_dtype if accum_dtype is not None else dtype return self._algorithm.execute( comm, int(input_buffer), @@ -191,12 +215,18 @@ class Algorithm: executor, nblocks, nthreads_per_block, - extras if extras is not None else {}, + symmetric_memory, + merged_extras, + int(accum_dtype), ) + def reset(self): + """Reset the internal state of the algorithm, if applicable.""" + self._algorithm.reset() + class AlgorithmBuilder: - def __init__(self, algorithm_builder: _AlgorithmBuilder): + def __init__(self, algorithm_builder: CppAlgorithmBuilder): self._algorithm_builder = algorithm_builder def build(self) -> Algorithm: @@ -204,7 +234,7 @@ class AlgorithmBuilder: class AlgorithmCollection: - def __init__(self, native_collection: _AlgorithmCollection): + def __init__(self, native_collection: CppAlgorithmCollection): self._native_collection = native_collection self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()] @@ -228,3 +258,24 @@ class AlgorithmCollection: """Register an algorithm for a collective operation.""" self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm) self._algorithms.append(algorithm) + + +_flag_buffer_cache = None + + +def get_flag_buffer() -> cp.ndarray: + """Get the default flag buffer for algorithm selection. + + This buffer is used internally by default algorithms to store selection flags. + It is allocated as a shared GPU buffer and can be accessed from Python. + The result is cached so all callers share the same buffer. + + Returns: + A CuPy array representing the flag buffer on the GPU. + """ + global _flag_buffer_cache + if _flag_buffer_cache is None: + buffer_ptr, buffer_size, owner = cpp_get_flag_buffer() + memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, owner), 0) + _flag_buffer_cache = cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr) + return _flag_buffer_cache diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index b54342ea..0575ca68 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import cupy as cp import numpy as np -from mscclpp._mscclpp import RawGpuBuffer +from mscclpp._mscclpp import CppRawGpuBuffer __all__ = ["GpuBuffer"] @@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray): if any(s <= 0 for s in shape): raise ValueError("Shape must be positive.") # Create the buffer - buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) + buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index 2b5a5f25..d42349dd 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -6,21 +6,21 @@ from typing import Type import cupy as cp from mscclpp._mscclpp import ( - Communicator, - Connection, + CppCommunicator, + CppConnection, connect_nvls_collective, - EndpointConfig, - Semaphore, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - TcpBootstrap, - Transport, - TransportFlags, + CppEndpointConfig, + CppSemaphore, + CppProxyService, + CppRegisteredMemory, + CppPortChannel, + CppMemoryChannel, + CppTcpBootstrap, + CppTransport, + CppTransportFlags, ) -import 
mpi4py
 import numpy as np
+import pickle
 
 from mscclpp.utils import is_torch_tensor
 
@@ -29,27 +29,47 @@ __all__ = ["CommGroup"]
 
 
 class CommGroup:
     def __init__(
-        self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
+        self,
+        mpi_comm: "mpi4py.MPI.Comm" = None,
+        torch_group: "dist.ProcessGroup" = None,
+        interfaceIpPortTrio: str = "",
+        rank: int = None,
+        size: int = None,
     ):
-        if interfaceIpPortTrio == "":
-            self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
+        if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
             uniq_id = None
-            if mpi_comm.rank == 0:
-                # similar to NCCL's unique id
+            rank, size = (
+                (mpi_comm.Get_rank(), mpi_comm.Get_size())
+                if mpi_comm is not None
+                else (torch_group.rank(), torch_group.size())
+            )
+            self.bootstrap = CppTcpBootstrap.create(rank, size)
+            if rank == 0:
                 uniq_id = self.bootstrap.create_unique_id()
-            uniq_id_global = mpi_comm.bcast(uniq_id, 0)
+            if mpi_comm is not None:
+                import mpi4py
+
+                uniq_id_global = mpi_comm.bcast(uniq_id, 0)
+            else:
+                import torch
+                import torch.distributed as dist
+
+                # All ranks must pass same-sized tensors to dist.broadcast, so rank 0
+                # pads the pickled unique id into the same fixed 256-byte buffer that
+                # the receivers allocate.
+                data_tensor = torch.zeros(256, dtype=torch.uint8)
+                if rank == 0:
+                    pickled_data = pickle.dumps(uniq_id)
+                    assert len(pickled_data) <= 256, "pickled unique id exceeds the 256-byte broadcast buffer"
+                    data_tensor[: len(pickled_data)] = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8)
+                dist.broadcast(data_tensor, src=0, group=torch_group)
+                uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
             self.bootstrap.initialize(uniq_id_global)
-        elif mpi_comm:
-            # use this instead
-            self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
-            self.bootstrap.initialize(interfaceIpPortTrio)
         elif not interfaceIpPortTrio == "":
             assert rank >= 0 and size >= 1
-            self.bootstrap = TcpBootstrap.create(rank, size)
+            self.bootstrap = CppTcpBootstrap.create(rank, size)
             self.bootstrap.initialize(interfaceIpPortTrio)
         else:
             raise RuntimeError("Either the interface or mpi_group need to be specified")
-        self.communicator = Communicator(self.bootstrap)
+        self.communicator = CppCommunicator(self.bootstrap)
         self.my_rank = self.bootstrap.get_rank()
         self.nranks = self.bootstrap.get_n_ranks()
         self.nranks_per_node = self.bootstrap.get_n_ranks_per_node()
@@ -63,43 +83,43 @@ class CommGroup:
     def recv(self, tensor: np.ndarray, peer: int, tag: int):
         self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag)
 
-    def my_ib_device(self, local_rank: int) -> Transport:
+    def my_ib_device(self, local_rank: int) -> CppTransport:
         if local_rank == 0:
-            return Transport.IB0
+            return CppTransport.IB0
         if local_rank == 1:
-            return Transport.IB1
+            return CppTransport.IB1
         if local_rank == 2:
-            return Transport.IB2
+            return CppTransport.IB2
         if local_rank == 3:
-            return Transport.IB3
+            return CppTransport.IB3
         if local_rank == 4:
-            return Transport.IB4
+            return CppTransport.IB4
         if local_rank == 5:
-            return Transport.IB5
+            return CppTransport.IB5
         if local_rank == 6:
-            return Transport.IB6
+            return CppTransport.IB6
         if local_rank == 7:
-            return Transport.IB7
+            return CppTransport.IB7
         else:
             assert False  # only 8 IBs are supported
 
     def make_connection(
         self,
         all_ranks: list[int],
-        endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport],
+        endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport],
         use_switch: bool = False,
-    ) -> dict[int, Connection]:
-        if type(endpoints) is Transport:
-            endpoints = EndpointConfig(endpoints)
+    ) -> dict[int, CppConnection]:
+        if 
type(endpoints) is CppTransport: + endpoints = CppEndpointConfig(endpoints) elif type(endpoints) is dict: - endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()} + endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()} connections = {} for rank in all_ranks: if type(endpoints) is dict: endpoint = endpoints[rank] else: endpoint = endpoints - if endpoint.transport == Transport.CudaIpc and use_switch: + if endpoint.transport == CppTransport.CudaIpc and use_switch: return connect_nvls_collective(self.communicator, all_ranks, 2**30) else: connections[rank] = self.communicator.connect(endpoint, rank) @@ -107,8 +127,8 @@ class CommGroup: return connections def register_tensor_with_connections( - self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: local_reg_memory = self.register_local_memory(tensor, connections) all_registered_memories = {} all_registered_memories[self.my_rank] = local_reg_memory @@ -121,8 +141,8 @@ class CommGroup: return all_registered_memories def _register_memory_with_connections( - self, memory: RegisteredMemory, connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, memory: CppRegisteredMemory, connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: all_registered_memories = {} all_registered_memories[self.my_rank] = memory future_memories = {} @@ -133,18 +153,20 @@ class CommGroup: all_registered_memories[rank] = future_memories[rank].get() return all_registered_memories - def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]: + def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]: future_semaphores = {} for rank in connections: future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank) return {rank: future.get() for rank, future in future_semaphores.items()} - def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + def make_memory_channels( + self, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], registered_memories[self.my_rank] ) return channels @@ -152,9 +174,9 @@ class CommGroup: def make_memory_channels_with_scratch( self, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, MemoryChannel]: + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections) channels = {} @@ -162,17 +184,17 @@ class CommGroup: tensor_size = ( tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize ) - local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags()) + local_registered_memory = 
self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags()) scratch_data_ptr = registeredScratchBuffer.data() for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr ) return channels def make_port_channels( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -188,12 +210,12 @@ class CommGroup: def make_port_channels_with_scratch( self, - proxy_service: ProxyService, + proxy_service: CppProxyService, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, PortChannel]: - transport_flags = TransportFlags() + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppPortChannel]: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( @@ -223,8 +245,8 @@ class CommGroup: return channels def register_semaphore_with_proxy( - self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) semaphore_ids = {} for rank in semaphores: @@ -235,7 +257,7 @@ class CommGroup: return channels def register_memory_with_proxy( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] ) -> dict[int, int]: registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -243,8 +265,8 @@ class CommGroup: memory_ids[rank] = proxy_service.add_memory(registered_memories[rank]) return memory_ids - def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory: - transport_flags = TransportFlags() + def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index 82ae93a9..b2da976d 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram from mscclpp.language.utils import AlgoSpec from mscclpp.utils import get_device_arch -from mscclpp._mscclpp import ( - ExecutionPlan, -) +from mscclpp._mscclpp import CppExecutionPlan, env logging.basicConfig(level=logging.INFO) @@ -51,7 +49,7 @@ class DslCompiler: into execution plans that can be run on GPUs. The compiled plans are cached to disk for reuse. - The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR` + The cache location can be configured via the `MSCCLPP_CACHE_DIR` environment variable (defaults to `~/.cache/mscclpp`). 
Example: @@ -138,7 +136,7 @@ class DslCompiler: ) ).hexdigest() - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp") + plan_dir = Path(env().cache_dir) os.makedirs(plan_dir, exist_ok=True) filename = f"{plan_id}.json" plan_path = os.path.join(plan_dir, filename) @@ -157,7 +155,7 @@ class DslCompiler: os.remove(tmp_path) except Exception: Path(plan_path).unlink(missing_ok=True) - execution_plan = ExecutionPlan(plan_path, rank) + execution_plan = CppExecutionPlan(plan_path, rank) return Algorithm( id=plan_id, execution_plan=execution_plan, @@ -179,8 +177,8 @@ class NativeCodeCompiler: based on the runtime environment. Compiled modules are cached to avoid recompilation. - The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR` - environment variable (defaults to `~/.cache/mscclpp/native`). + The cache location can be configured via the `MSCCLPP_CACHE_DIR` + environment variable (defaults to `~/.cache/mscclpp`). Attributes: _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA. @@ -226,8 +224,7 @@ class NativeCodeCompiler: "-L" + os.path.join(self._lib_home, "lib"), "-lmscclpp", ] - cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native") - self._cache_dir = Path(cache_root) + self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) def _get_compiler(self) -> str: @@ -283,7 +280,7 @@ class NativeCodeCompiler: Note: - The source file should include pybind11 bindings to expose functions. - MSCCLPP headers are automatically included in the compilation. - - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native). + - The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp). - File locking is used to prevent race conditions during parallel compilation. 
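Both compilers above now resolve their caches under a single `MSCCLPP_CACHE_DIR` root. A minimal sketch of the resulting on-disk layout, assuming only what the hunks above state (the `~/.cache/mscclpp` default, a `default` subdirectory written by `python -m mscclpp`, and a `native` subdirectory used by `NativeCodeCompiler`); the helper name is hypothetical, not part of the patch:

    import os
    from pathlib import Path

    def mscclpp_cache_layout() -> dict:
        # Hypothetical helper mirroring the consolidated cache layout.
        root = Path(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"))
        return {
            "plans": root,                      # DslCompiler writes <plan_id>.json here
            "default_plans": root / "default",  # prebuilt plans from `python -m mscclpp`
            "native_modules": root / "native",  # NativeCodeCompiler build cache
        }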
Example: diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py index 51a178fb..ddfb929f 100644 --- a/python/mscclpp/ext/algorithm_collection_builder.py +++ b/python/mscclpp/ext/algorithm_collection_builder.py @@ -3,12 +3,10 @@ from __future__ import annotations from typing import Union -from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection +from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer import atexit -from mscclpp._mscclpp import ( - AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder, -) +from mscclpp._mscclpp import CppAlgorithmCollectionBuilder __all__ = ["AlgorithmCollectionBuilder"] @@ -24,13 +22,14 @@ class AlgorithmCollectionBuilder: @classmethod def reset(cls): if cls._instance is not None: - _AlgorithmCollectionBuilder.reset() + CppAlgorithmCollectionBuilder.reset() cls._instance = None def __init__(self): if not hasattr(self, "_initialized"): - self._builder = _AlgorithmCollectionBuilder.get_instance() + self._builder = CppAlgorithmCollectionBuilder.get_instance() self._initialized = True + self._flag_buffer = None def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]): if isinstance(algorithm_builder, AlgorithmBuilder): @@ -52,8 +51,17 @@ class AlgorithmCollectionBuilder: collection = self._builder.build() return AlgorithmCollection(collection) - def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection: - native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank) + def build_default_algorithms( + self, + scratch_buffer: int, + scratch_buffer_size: int, + rank: int, + ) -> AlgorithmCollection: + if self._flag_buffer is None: + self._flag_buffer = get_flag_buffer() + native_collection = self._builder.build_default_algorithms( + int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank + ) return AlgorithmCollection(native_collection) diff --git a/python/mscclpp/ext/alltoallv_single.py b/python/mscclpp/ext/alltoallv_single.py index 2a29b3f5..e45ef950 100644 --- a/python/mscclpp/ext/alltoallv_single.py +++ b/python/mscclpp/ext/alltoallv_single.py @@ -24,11 +24,11 @@ def _a2av_dbg(msg: str): if _DEBUG_A2AV: print(msg, file=sys.stderr, flush=True) from mscclpp._mscclpp import ( - Communicator, - TcpBootstrap, - DataType, - ReduceOp, - CommResult, + CppCommunicator as Communicator, + CppTcpBootstrap as TcpBootstrap, + CppDataType as DataType, + CppReduceOp as ReduceOp, + CppCommResult as CommResult, ) from mscclpp.ext.algorithm_collection_builder import AlgorithmCollectionBuilder @@ -375,6 +375,7 @@ class MscclppAlltoAllV: None, # executor (not needed for native algos) 0, # nblocks (auto) 0, # nthreads_per_block (auto) + False, # symmetric_memory self._extras, ) diff --git a/python/mscclpp/language/channel.py b/python/mscclpp/language/channel.py index 1b22e4e2..23d76eda 100644 --- a/python/mscclpp/language/channel.py +++ b/python/mscclpp/language/channel.py @@ -140,7 +140,7 @@ class MemoryChannel: for tb_id in tb_list: tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb_id, remote_chunk, self.channel_type) - tb_channel_ids = get_program().setup_channel(tb, self) + tb_channel_ids = get_program().setup_channel(tb_id, self) op = GetOperation( src_buff=[RemoteChunk(src_chunk.buffer, src_chunk.index, src_chunk.size, tb_chunk_id)], 
dst_buff=[LocalChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size)], diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py index 127f4a03..5fb392e3 100644 --- a/python/mscclpp/language/internal/operations.py +++ b/python/mscclpp/language/internal/operations.py @@ -534,6 +534,7 @@ class PutOperation(BaseOperation): self.dst_buff = dst_buff self.channel_ids = channel_ids self.channel_type = channel_type + self.from_packet = from_packet self.to_packet = to_packet self.with_signal = with_signal self.with_signal_and_flush = with_signal_and_flush @@ -579,6 +580,25 @@ class PutOperation(BaseOperation): with_signal=self.with_signal, with_signal_and_flush=self.with_signal_and_flush, ) + elif ( + isinstance(other, PutOperation) + and self.name == Instruction.read_put_packet + and self.name == other.name + and self.src_buff == other.src_buff + and self.channel_type == other.channel_type + and self.tbg_info == other.tbg_info + ): + fused_operation = PutOperation( + src_buff=self.src_buff, + dst_buff=self.dst_buff + other.dst_buff, + channel_ids=self.channel_ids + other.channel_ids, + channel_type=self.channel_type, + tbg_info=self.tbg_info, + from_packet=self.from_packet, + to_packet=self.to_packet, + with_signal=self.with_signal, + with_signal_and_flush=self.with_signal_and_flush, + ) return fused_operation @@ -725,7 +745,7 @@ class ReduceOperation(BaseOperation): remote_dst_buff=self.remote_dst_buff + other.dst_buff, channel_ids=self.channel_ids, put_channel_ids=self.put_channel_ids + other.channel_ids, - channel_type=self.channel_type, + channel_type=other.channel_type, reduce_operation=self.reduce_operation, tbg_info=self.tbg_info, packet=self.packet, diff --git a/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py new file mode 100644 index 00000000..bda9e36c --- /dev/null +++ b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
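The new `read_put_packet` branch in `PutOperation` above merges compatible puts instead of emitting them one by one. A simplified stand-in for the rule, using toy types (`MiniPut` and `try_fuse` are hypothetical, not the DSL classes): two ops fuse only when they read the same source over the same channel type, and the fusion concatenates destinations and channel ids:

    from __future__ import annotations
    from dataclasses import dataclass

    @dataclass
    class MiniPut:
        src: tuple          # stands in for src_buff
        dsts: list          # stands in for dst_buff
        channel_ids: list
        channel_type: str

    def try_fuse(a: MiniPut, b: MiniPut) -> MiniPut | None:
        # Mirrors the predicate added above: same source, same channel type.
        if a.src == b.src and a.channel_type == b.channel_type:
            return MiniPut(a.src, a.dsts + b.dsts, a.channel_ids + b.channel_ids, a.channel_type)
        return None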
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
+    chunksperloop = 1
+    collective = AllGather(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # Creating scratch buffers; the second half (starting at scratch_offset)
+        # holds the locally packed packets.
+        scratch_buffer = []
+        for gpu in range(gpu_size):
+            scratch_buffer.append(Buffer(gpu, 2 * gpu_size))
+        scratch_offset = gpu_size
+
+        # Copying the local input into the scratch buffer as packets
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            input_buffer = rank.get_input_buffer()
+            rank.copy_packets(
+                scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], input_buffer[0:1], tb=0
+            )
+
+        # Putting packets in the remote scratch buffers
+        for gpu in range(gpu_size):
+            for peer in range(1, gpu_size):
+                dst_rank = (gpu + peer) % gpu_size
+                ch = MemoryChannel(dst_rank, gpu)
+                tb = 0
+                ch.read_put_packets(
+                    scratch_buffer[dst_rank][gpu : gpu + 1],
+                    scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1],
+                    tb,
+                )
+
+        # Copying packets from the local scratch buffer to the output buffer
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            output_buffer = rank.get_output_buffer()
+            for peer in range(1, gpu_size):
+                dst_rank = (gpu + peer) % gpu_size
+                rank.unpack_packets(
+                    output_buffer[dst_rank : dst_rank + 1],
+                    scratch_buffer[gpu][dst_rank : dst_rank + 1],
+                    tb=0,
+                )
+
+        print(JSON())
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--num_gpus", type=int, help="number of gpus")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+
+allgather_example(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 783b0ca9..93cd786b 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -11,7 +11,7 @@ from typing import Any, Type, Union
 import cupy as cp
 import numpy as np
 
-from mscclpp._mscclpp import DataType
+from mscclpp._mscclpp import CppDataType as DataType
 
 try:
     import torch
 
@@ -192,5 +192,13 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
         return DataType.int32
     elif dtype == torch.bfloat16:
         return DataType.bfloat16
+    # Hardware supports either the OCP format or the FNUZ format for float8.
+    # Mapping both to the same MSCCLPP data type.
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz: + return DataType.float8_e5m2 + elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz: + return DataType.float8_e4m3 + elif dtype == torch.uint8: + return DataType.uint8 else: raise ValueError(f"Unknown data type: {dtype}") diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index b49a404c..49cf13bc 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -6,4 +6,5 @@ pytest numpy matplotlib sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed -blake3 \ No newline at end of file +blake3 +pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index e69de29b..7ed4fef3 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -0,0 +1,10 @@ +mpi4py +cupy +prettytable +netifaces +pytest +numpy +matplotlib +sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +blake3 +pybind11 \ No newline at end of file diff --git a/python/test/_cpp/proxy_test.cpp b/python/test/_cpp/proxy_test.cpp index 5bc18e23..697a5c38 100644 --- a/python/test/_cpp/proxy_test.cpp +++ b/python/test/_cpp/proxy_test.cpp @@ -63,10 +63,13 @@ class MyProxyService { }; NB_MODULE(_ext, m) { +#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS + nb::set_leak_warnings(false); +#endif nb::class_(m, "MyProxyService") .def(nb::init(), nb::arg("rank"), nb::arg("nranks"), nb::arg("data_size"), nb::arg("reg_mem_list"), nb::arg("sem_list")) .def("fifo_device_handle", &MyProxyService::fifoDeviceHandle) .def("start", &MyProxyService::start) .def("stop", &MyProxyService::stop); -} +} \ No newline at end of file diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 49e5166f..59bc1661 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,7 +11,7 @@ from mscclpp import ( env, ) from mscclpp import CommGroup, GpuBuffer -from mscclpp.utils import KernelBuilder, GpuBuffer, pack +from mscclpp.utils import KernelBuilder, pack import os import struct diff --git a/python/test/test_alltoallv_mscclpp.py b/python/test/test_alltoallv_mscclpp.py index e8797e43..d45fb6f4 100644 --- a/python/test/test_alltoallv_mscclpp.py +++ b/python/test/test_alltoallv_mscclpp.py @@ -130,11 +130,11 @@ def main(): print("=" * 60) # Import after torch.distributed init - from mscclpp._mscclpp import ( + from mscclpp import ( Communicator, TcpBootstrap, - UniqueId, ) + from mscclpp._mscclpp import CppUniqueId as UniqueId from mscclpp.ext.alltoallv_single import MscclppAlltoAllV # Create mscclpp communicator with TcpBootstrap diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py new file mode 100644 index 00000000..82981ce1 --- /dev/null +++ b/python/test/test_fp8_accum.py @@ -0,0 +1,397 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Correctness test for FP8 allreduce with different accumulation types. +# +# Verifies that FP8 allreduce with higher-precision accumulation produces +# results at least as accurate as native FP8 accumulation, by comparing +# against a float32 reference. 
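The `accum_dtype` knob added to `Algorithm.execute` earlier in this patch is what this new test exercises. A minimal usage sketch, assuming a communicator and a symmetric `GpuBuffer` named `buf` are already set up as in the fixtures below:

    ret = algo.execute(
        comm=comm_group.communicator,
        input_buffer=buf.data.ptr,
        output_buffer=buf.data.ptr,
        input_size=buf.nbytes,
        output_size=buf.nbytes,
        dtype=DataType.float8_e4m3,
        op=ReduceOp.SUM,
        stream=cp.cuda.get_current_stream().ptr,
        accum_dtype=DataType.float32,  # accumulate partial sums in fp32, round to fp8 once
    )
    assert ret == 0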
+# +# Usage: +# mpirun -np 8 pytest python/test/test_fp8_accum.py -v + +import cupy as cp +import numpy as np +import pytest + +from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported +from mscclpp.ext import AlgorithmCollectionBuilder +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group + +# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs. +# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed. +_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip +_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89 +pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA") + +# --------------------------------------------------------------------------- +# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7) +# --------------------------------------------------------------------------- + + +def e4m3fn_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3FN bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-7) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 & mant==7 + nan_mask = (exp == 15) & (mant == 7) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3fn(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3FN bit patterns. + + Uses a lookup-table approach: precompute all 128 positive E4M3FN values, + then find nearest match per element via chunked broadcast comparison. 
+ """ + # Build lookup table of all 128 positive E4M3FN values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3fn_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -448.0, 448.0) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80) +# --------------------------------------------------------------------------- + + +def e4m3b15_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3B15 bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-15) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 or negative zero (0x80) + nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3b15(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3B15 bit patterns. + + Same lookup-table approach as float_to_e4m3fn. 
+ """ + # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3b15_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -0.9375, 0.9375) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# Shared test helpers +# --------------------------------------------------------------------------- + + +def setup_algorithms(mpi_group): + """Build default algorithms and return (comm_group, algo_map, scratch_buf).""" + comm_group = CommGroup(mpi_group.comm) + scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB + AlgorithmCollectionBuilder.reset() + builder = AlgorithmCollectionBuilder() + algorithms = builder.build_default_algorithms( + scratch_buffer=scratch.data.ptr, + scratch_buffer_size=scratch.nbytes, + rank=comm_group.my_rank, + ) + algo_map = {a.name: a for a in algorithms} + return comm_group, algo_map, scratch + + +def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0): + """Run allreduce in-place on buffer and return a copy of the result.""" + ret = algo.execute( + comm=comm_group.communicator, + input_buffer=buffer.data.ptr, + output_buffer=buffer.data.ptr, + input_size=buffer.nbytes, + output_size=buffer.nbytes, + dtype=dtype, + op=ReduceOp.SUM, + stream=cp.cuda.get_current_stream().ptr, + nblocks=nblocks, + nthreads_per_block=nthreads_per_block, + symmetric_memory=True, + accum_dtype=accum_dtype, + ) + cp.cuda.Device().synchronize() + assert ret == 0, f"Allreduce failed with error code {ret}" + return buffer.copy() + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_fullmesh", + "default_allreduce_rsag_zero_copy", + "default_allreduce_allpair_packet", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576]) +def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3 allreduce with higher-precision accumulation is at + least as accurate as native FP8 accumulation, across all algorithm variants.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + if "nvls" in algo_name and not is_nvls_supported(): + 
pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform") + algo = algo_map[algo_name] + + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("fp8_native", DataType.float8_e4m3), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy and fullmesh need explicit block/thread counts + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + elif "fullmesh" in algo_name: + nb = 35 + nt = 512 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm) + rng = np.random.RandomState(42 + rank) + src_f32 = cp.asarray(rng.randn(size).astype(np.float32)) + src_f32 = cp.clip(src_f32, -240.0, 240.0) + src_fp8 = float_to_e4m3fn(src_f32) + + # Copy into symmetric buffer + buf[:] = src_fp8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + result_f32 = e4m3fn_to_float(result) + + # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32 + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + rng_r = np.random.RandomState(42 + r) + rank_data = cp.asarray(rng_r.randn(size).astype(np.float32)) + rank_data = cp.clip(rank_data, -240.0, 240.0) + rank_data_fp8 = float_to_e4m3fn(rank_data) + ref_f32 += e4m3fn_to_float(rank_data_fp8) + + # Compute errors + abs_err = cp.abs(result_f32 - ref_f32) + mean_abs_err = float(cp.mean(abs_err)) + errors[accum_label] = mean_abs_err + + # Reset between runs + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native fp8 + assert ( + errors["float16"] <= errors["fp8_native"] + 1e-6 + ), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})" + assert ( + errors["float32"] <= errors["fp8_native"] + 1e-6 + ), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})" + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3B15 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_rsag_zero_copy", + "default_allreduce_fullmesh", + "default_allreduce_allpair_packet", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 65536]) +def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at + least as accurate as native E4M3B15 accumulation.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + if "nvls" in algo_name and not is_nvls_supported(): + pytest.skip(f"{algo_name} requires NVLS which is not supported on this platform") + + algo = algo_map[algo_name] + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("e4m3b15_native", DataType.float8_e4m3b15), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy needs explicit block/thread counts, scaled to data size + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 
32))) + nt = 1024 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank random uint8 values in valid e4m3b15 range + rng = np.random.RandomState(42 + rank) + raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8)) + signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7 + src_uint8 = raw | signs + # Fix negative zero -> positive zero + src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8) + + # Copy into symmetric buffer + buf[:] = src_uint8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3b15, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + + # Decode result + result_f32 = e4m3b15_to_float(result) + + # Compute float32 reference + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + rng_r = np.random.RandomState(42 + r) + raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8)) + signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7 + bits_r = raw_r | signs_r + bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r) + ref_f32 += e4m3b15_to_float(bits_r) + + # Clamp reference to e4m3b15 representable range + ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375) + + # Compute errors (only on valid entries) + valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) + abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) + mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 + errors[accum_label] = mean_abs_err + + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native + assert ( + errors["float16"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" + assert ( + errors["float32"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a6899642..6b3119cb 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -162,13 +162,10 @@ def create_connection(group: CommGroup, connection_type: str): def create_group_and_connection(mpi_group: MpiGroup, connection_type: str): if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") + if connection_type == "IB" and os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) - try: - connection = create_connection(group, connection_type) - except Error as e: - if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage: - pytest.skip("IB not supported on this node") - raise + connection = create_connection(group, connection_type) return group, connection @@ -281,6 +278,8 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) @@ -301,6 +300,8 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 
16)
 def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
+    if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0":
+        pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1")
     group = CommGroup(mpi_group.comm)
     tran = group.my_ib_device(group.my_rank % 8)
     endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c1aa25bb..9ca5fed3 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -28,6 +28,16 @@ if(MSCCLPP_USE_IB)
   target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
   target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
   target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
+  if(MLX5_FOUND)
+    target_include_directories(mscclpp_obj SYSTEM PRIVATE ${MLX5_INCLUDE_DIRS})
+    target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MLX5DV)
+  endif()
+endif()
+
+if(MSCCLPP_USE_GDRCOPY)
+  target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
+  target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
+  target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
 endif()
 
 set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc
index 07da9045..a492ee6a 100644
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -3,6 +3,7 @@
 
 #include
 #include
+#include
 
 #include "logger.hpp"
 
@@ -40,19 +41,21 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF
 CommResult NativeAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                     size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op,
                                     cudaStream_t stream, std::shared_ptr<Executor>, int nBlocks, int nThreadsPerBlock,
-                                    const std::unordered_map<std::string, uintptr_t>& extras) {
+                                    bool symmetricMemory, const std::unordered_map<std::string, uintptr_t>& extras,
+                                    DataType accumDtype) {
+  if (accumDtype == DataType::AUTO) accumDtype = dtype;
   if (!initialized_) {
     initFunc_(comm);
     initialized_ = true;
   }
-  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype);
+  AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory);
   auto it = contexts_.find(ctxKey);
   if (it == contexts_.end()) {
     auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype);
     contexts_[ctxKey] = ctx;
   }
   return kernelLaunchFunc_(contexts_[ctxKey], input, output, inputSize, outputSize, dtype, op, stream, nBlocks,
-                           nThreadsPerBlock, extras);
+                           nThreadsPerBlock, extras, accumDtype);
 }
 
 const std::string& NativeAlgorithm::name() const { return name_; }
@@ -65,6 +68,11 @@ const std::pair<size_t, size_t>& NativeAlgorithm::messageRange() const {
   return range;
 }
 
+void NativeAlgorithm::setMessageSizeRange(size_t minMessageSize, size_t maxMessageSize) {
+  minMessageSize_ = minMessageSize;
+  maxMessageSize_ = maxMessageSize;
+}
+
 const std::unordered_map<std::string, int>& NativeAlgorithm::tags() const { return tags_; }
 
 const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferMode_; }
@@ -142,6 +150,10 @@ const std::pair<size_t, size_t>& DslAlgorithm::messageRange() const {
   return range;
 }
 
+void DslAlgorithm::setMessageSizeRange(size_t, size_t) {
+  THROW(EXEC, Error, ErrorCode::InvalidUsage, "setMessageSizeRange is only supported for native algorithms");
+}
+
 const std::unordered_map<std::string, int>& DslAlgorithm::tags() const { return tags_; }
 
 const CollectiveBufferMode& DslAlgorithm::bufferMode() const {
@@ -155,8 +167,8 @@
 Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; }
 
 CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void* input, void* output,
                                  size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream,
-                                 std::shared_ptr<Executor> executor, int, int,
-                                 const std::unordered_map<std::string, uintptr_t>&) {
+                                 std::shared_ptr<Executor> executor, int, int, bool,
+                                 const std::unordered_map<std::string, uintptr_t>&, DataType) {
   if (!executor) {
     THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute");
   }
@@ -173,15 +185,19 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
                         stream);
       break;
 #if defined(__FP8_TYPES_EXIST__)
-    case DataType::FP8_E4M3:
-      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FP8_E4M3, plan_,
-                        stream);
+    case DataType::FLOAT8_E4M3:
+      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3,
+                        plan_, stream);
       break;
-    case DataType::FP8_E5M2:
-      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FP8_E5M2, plan_,
-                        stream);
+    case DataType::FLOAT8_E5M2:
+      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2,
+                        plan_, stream);
       break;
 #endif
+    case DataType::FLOAT8_E4M3B15:
+      executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize,
+                        DataType::FLOAT8_E4M3B15, plan_, stream);
+      break;
     case DataType::INT32:
     case DataType::UINT32:
       executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream);
@@ -198,4 +214,23 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
 
 // TODO: implement this
 void DslAlgorithm::reset() {}
 
+static uint32_t* gDefaultFlagBuffer = nullptr;
+static std::weak_ptr<uint32_t> gDefaultFlagBufferWeak;
+static size_t gDefaultFlagCount = 128;
+
+std::pair<std::shared_ptr<uint32_t>, size_t> getFlagBuffer() {
+  auto ptr = gDefaultFlagBufferWeak.lock();
+  if (!ptr) {
+    if (!gDefaultFlagBuffer) {
+      // Intentionally never freed — CUDA driver reclaims GPU memory at process exit.
+      gDefaultFlagBuffer = static_cast<uint32_t*>(mscclpp::detail::gpuCalloc(gDefaultFlagCount * sizeof(uint32_t)));
+      std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
+      mscclpp::gpuMemcpy(gDefaultFlagBuffer, initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
+    }
+    ptr = std::shared_ptr<uint32_t>(gDefaultFlagBuffer, [](void*) {});
+    gDefaultFlagBufferWeak = ptr;
+  }
+  return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
+}
+
 }  // namespace mscclpp
diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index a146f0de..c95ca421 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -4,7 +4,6 @@
 #include "communicator.hpp"
 
 #include "api.h"
-#include "debug.h"
 
 namespace mscclpp {
 
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 10a43e88..8b6c0afb 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -7,7 +7,8 @@
 #include
 #endif
 
-#include
+#include
+#include
 #include
 #include
 #include
@@ -15,6 +16,7 @@
 #include "api.h"
 #include "context.hpp"
 #include "endpoint.hpp"
+#include "gpu_utils_internal.hpp"
 #include "logger.hpp"
 
 namespace mscclpp {
@@ -180,25 +182,185 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) {
 
 // IBConnection
 
+void IBConnection::recvThreadFunc() {
+  // Set the CUDA device context for this thread
+  if (localGpuDeviceId_ >= 0) {
+    cudaError_t err = cudaSetDevice(localGpuDeviceId_);
+    if (err != cudaSuccess) {
+      WARN(NET, "IBConnection recvThreadFunc: cudaSetDevice(", localGpuDeviceId_,
+           ") failed: ", cudaGetErrorString(err));
+      return;
+    }
+    // Bind this thread to the NUMA node of the local GPU for optimal memory access
+    int deviceNumaNode = getDeviceNumaNode(localGpuDeviceId_);
+    if (deviceNumaNode >= 0) {
+      numaBind(deviceNumaNode);
+    }
+  }
+
+  uint32_t lastImmData = 0;
+  uint64_t immHighBits = 0;
+  uint64_t newValueHost = 0;
+
+  auto qp = qp_.lock();
+  if (!qp) return;
+
+  while (!stopRecvThread_.load(std::memory_order_relaxed)) {
+    int wcNum = qp->pollRecvCq();
+    if (wcNum < 0) {
+      recvThreadErrorMsg_ = "pollRecvCq failed";
+      recvThreadError_.store(true, std::memory_order_release);
+      WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+      break;
+    }
+
+    for (int i = 0; i < wcNum; ++i) {
+      int status = qp->getRecvWcStatus(i);
+      if (status != static_cast<int>(WsStatus::Success)) {
+        // A failed recv WC typically means the QP entered error state (e.g., WR Flushed Error).
+        // All remaining WRs will also fail — no recovery without QP recreation. Exit the thread
+        // and set the error flag so the main thread can detect it.
+        recvThreadErrorMsg_ = std::string("recv work completion failed: ") + qp->getRecvWcStatusString(i);
+        recvThreadError_.store(true, std::memory_order_release);
+        WARN(NET, "IBConnection recvThreadFunc: ", recvThreadErrorMsg_);
+        return;
+      }
+
+      // Read the lower 32 bits of the token from imm_data. Reconstruct the full 64-bit value
+      // using wrap-around detection: tokens increase monotonically, so if the new lower 32 bits
+      // are less than the previous value, the upper 32 bits must have incremented by 1.
+      uint32_t immData = qp->getRecvWcImmData(i);
+      if (immData < lastImmData) {
+        immHighBits += (1ULL << 32);
+      }
+      lastImmData = immData;
+      newValueHost = immHighBits | static_cast<uint64_t>(immData);
+
+      // Forward the token to the semaphore's inbound token address via atomicStore
+      // through the GDRCopy BAR1 mapping. The GPU reads with system-scope acquire.
+ if (signalAddr_ != 0) { + if (signalGdrMap_ && signalGdrMap_->valid()) { + atomicStore(signalGdrMap_->hostPtr(), newValueHost, memoryOrderRelaxed); + } else { + // For HIP/ROCm. + // NOTE: may need a fix in the future to ensure BAR1 mapping. + *reinterpret_cast(signalAddr_) = newValueHost; + } + } + + // Post another recv for future messages + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + } + } +} + IBConnection::IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) : BaseConnection(context, localEndpoint), transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()), - dummyAtomicSource_(std::make_unique(0)) { + atomicSrc_(std::make_unique(0)), + ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_), + gdrSignalForwarding_(false), + stopRecvThread_(false), + recvThreadError_(false), + localGpuDeviceId_(localEndpoint.device().id), + signalAddr_(0) { qp_ = getImpl(localEndpoint).ibQp_; qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_); qp_.lock()->rts(); - dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_); - validateTransport(dummyAtomicSourceMem_, transport_); - dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_); - INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created"); + atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_); + validateTransport(atomicSrcMem_, transport_); + atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_); + + if (ibNoAtomic_) { +#if defined(MSCCLPP_USE_CUDA) + // On CUDA, HostNoAtomic requires GDRCopy for CPU→GPU signal forwarding through BAR1. + if (!gdrEnabled()) { + THROW(CONN, Error, ErrorCode::InvalidUsage, + "IB host-no-atomic mode on CUDA requires GDRCopy: ", gdrStatusMessage()); + } + gdrSignalForwarding_ = true; +#endif // defined(MSCCLPP_USE_CUDA) + + // On platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200 + // NVLink-C2C), HostNoAtomic requires Data Direct for correct memory ordering. Data Direct + // routes NIC DMA through the PCIe Data Direct engine, bypassing the bridge. It is available + // on Virtual Function (VF) devices. On platforms without such a bridge (x86, non-Grace + // aarch64), HostNoAtomic works without Data Direct. + // + // We cannot reliably detect the bridge at compile time or runtime, so we emit a warning + // when the device is not a VF. If data corruption occurs, switching to VF devices with + // Data Direct or using IbMode::Host with RDMA atomics will resolve it. + { + IbCtx* ibCtx = getImpl(*context).getIbContext(transport_); + if (!ibCtx->isVirtualFunction()) { + WARN(CONN, + "IB HostNoAtomic mode without a Virtual Function (VF) device may cause data corruption " + "on platforms with a CPU-GPU bridge that reorders posted writes (e.g., Grace/GB200). " + "Device ", + ibCtx->getDevName(), + " is not a VF. " + "If you experience data corruption, use VF devices with Data Direct or IbMode::Host."); + } + } + + // Pre-post receive requests for incoming WRITE_WITH_IMM notifications. + // The recv CQE guarantees the preceding data WRITE has been committed to GPU memory. + auto qp = qp_.lock(); + int maxRecvWr = localEndpoint.config().ib.maxRecvWr; + for (int i = 0; i < maxRecvWr; ++i) { + qp->stageRecv(/*wrId=*/0); + } + qp->postRecv(); + // The recv thread is started later in startSignalForwarding() when the semaphore + // provides the signal forwarding destination. 
This ensures the thread lifetime is + // bounded by the GdrMap lifetime (created before start, destroyed after stop). + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with signal forwarding (HostNoAtomic) mode"); + } else { + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode"); + } } +IBConnection::~IBConnection() { stopSignalForwarding(); } + Transport IBConnection::transport() const { return transport_; } Transport IBConnection::remoteTransport() const { return remoteTransport_; } +bool IBConnection::isSignalForwarding() const { return ibNoAtomic_; } + +void IBConnection::startSignalForwarding(std::shared_ptr mem) { + // Set up the forwarding destination and GdrMap, then start the recv thread. + // Order: set address → create GdrMap → start thread. + signalAddr_ = reinterpret_cast(mem.get()); + if (gdrSignalForwarding_) { + signalGdrMap_ = std::make_unique(std::move(mem), localGpuDeviceId_); + } + if (ibNoAtomic_) { + stopRecvThread_.store(false, std::memory_order_relaxed); + recvThread_ = std::thread([this]() { this->recvThreadFunc(); }); + } + INFO(CONN, "IBConnection startSignalForwarding: ", (void*)signalAddr_); +} + +void IBConnection::stopSignalForwarding() { + // Stop the recv thread, then tear down GdrMap and address. + // Order: stop thread → destroy GdrMap → clear address. + if (ibNoAtomic_) { + stopRecvThread_.store(true, std::memory_order_relaxed); + if (recvThread_.joinable()) { + recvThread_.join(); + } + } + if (gdrSignalForwarding_) { + signalGdrMap_.reset(); + } + signalAddr_ = 0; + INFO(CONN, "IBConnection stopSignalForwarding"); +} + void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_WRITE_ENTRY) @@ -220,8 +382,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp_.lock()->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, - /*signaled=*/true); + qp_.lock()->stageSendWrite(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, + /*dstOffset=*/dstOffset, /*signaled=*/true); qp_.lock()->postSend(); INFO(CONN, "IBConnection write: from ", (uint8_t*)srcMr->getBuff() + srcOffset, " to ", @@ -248,12 +410,32 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 uint64_t oldValue = *src; *src = newValue; - qp_.lock()->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, - /*signaled=*/true); - - qp_.lock()->postSend(); - INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, - " -> ", newValue); + if (ibNoAtomic_) { + // Signal forwarding: send a 0-byte RDMA WRITE_WITH_IMM with the lower 32 bits of the + // token in imm_data. The receiver reconstructs the full 64-bit value using wrap-around + // detection (tokens are monotonically increasing, so a decrease in the lower 32 bits + // indicates the upper 32 bits incremented by 1). 
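Stepping back to startSignalForwarding/stopSignalForwarding above: their mirrored ordering (address, then mapping, then thread on start; thread, then mapping, then address on stop) is the standard discipline for a polling thread that dereferences an owned mapping. A minimal sketch, with a heap allocation standing in for the GdrMap BAR1 mapping:

    #include <atomic>
    #include <memory>
    #include <thread>

    struct Forwarder {
      std::atomic<bool> stop{false};
      std::unique_ptr<uint64_t> mapping;  // stand-in for the BAR1 mapping
      std::thread worker;

      void start() {
        mapping = std::make_unique<uint64_t>(0);  // create the mapping first
        worker = std::thread([this] {
          while (!stop.load(std::memory_order_relaxed)) ++*mapping;
        });
      }
      void shutdown() {
        stop.store(true, std::memory_order_relaxed);
        if (worker.joinable()) worker.join();  // join before tearing down...
        mapping.reset();                       // ...what the thread dereferences
      }
    };

    int main() {
      Forwarder f;
      f.start();
      f.shutdown();
      return 0;
    }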
+ if (newValue <= oldValue) { + WARN(CONN, "IBConnection signal forwarding: token is not monotonically increasing: ", oldValue, " -> ", newValue); + } else if (newValue - oldValue >= (1ULL << 32)) { + WARN(CONN, + "IBConnection signal forwarding: token increment too large for 32-bit wrap-around detection: ", oldValue, + " -> ", newValue, " (delta ", newValue - oldValue, " >= 2^32)"); + } + unsigned int immData = static_cast(newValue); + qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo, + /*size=*/0, /*wrId=*/0, + /*srcOffset=*/0, /*dstOffset=*/0, + /*signaled=*/true, /*immData=*/immData); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection signal forwarding: value ", oldValue, " -> ", newValue); + } else { + qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, + /*signaled=*/true); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection atomic write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, + " -> ", newValue); + } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT) NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT, 0, 0, *NpKit::GetCpuTimestamp(), 0); @@ -265,22 +447,27 @@ void IBConnection::flush(int64_t timeoutUsec) { NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_FLUSH_ENTRY, 0, 0, *NpKit::GetCpuTimestamp(), 0); #endif + // Check if the recv thread has already reported an error (e.g., QP entered error state). + if (recvThreadError_.load(std::memory_order_acquire)) { + THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_); + } + Timer timer; - while (qp_.lock()->getNumCqItems()) { - int wcNum = qp_.lock()->pollCq(); + while (qp_.lock()->getNumSendCqItems()) { + int wcNum = qp_.lock()->pollSendCq(); if (wcNum < 0) { - THROW(NET, IbError, errno, "pollCq failed"); + THROW(NET, IbError, errno, "pollSendCq failed"); } else if (timeoutUsec >= 0) { auto elapsed = timer.elapsed(); if (elapsed > timeoutUsec) { - THROW(CONN, Error, ErrorCode::Timeout, "pollCq timed out: waited for ", elapsed / 1e6, " seconds. Expected ", - qp_.lock()->getNumCqItems(), " signals"); + THROW(CONN, Error, ErrorCode::Timeout, "pollSendCq timed out: waited for ", elapsed / 1e6, + " seconds. 
Expected ", qp_.lock()->getNumSendCqItems(), " signals"); } } for (int i = 0; i < wcNum; ++i) { - int status = qp_.lock()->getWcStatus(i); + int status = qp_.lock()->getSendWcStatus(i); if (status != static_cast(WsStatus::Success)) { - THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getWcStatusString(i)); + THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getSendWcStatusString(i)); } } } diff --git a/src/core/context.cc b/src/core/context.cc index 9bf299d3..aabe71df 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() { } } -void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_)); dirty_ = true; } -void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_)); @@ -46,9 +46,7 @@ void CudaIpcStream::sync() { } } -Context::Impl::Impl() {} - -IbCtx *Context::Impl::getIbContext(Transport ibTransport) { +IbCtx* Context::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts_.find(ibTransport); if (it == ibContexts_.end()) { @@ -70,7 +68,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} MSCCLPP_API_CPP Context::~Context() = default; -MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) { +MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); } @@ -78,7 +76,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { return Endpoint(std::make_shared(config, *pimpl_)); } -MSCCLPP_API_CPP Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) { +MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) { if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) { throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage); } diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc index 3833fdc4..fe51e348 100644 --- a/src/core/endpoint.cc +++ b/src/core/endpoint.cc @@ -4,9 +4,13 @@ #include "endpoint.hpp" #include +#include #include "api.h" #include "context.hpp" +#include "ib.hpp" +#include "logger.hpp" +#include "registered_memory.hpp" #include "serialization.hpp" #include "socket.h" #include "utils_internal.hpp" @@ -23,9 +27,36 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl) if (config_.maxWriteQueueSize <= 0) { config_.maxWriteQueueSize = config_.ib.maxCqSize; } + + // Determine if we should use no-atomics mode + ibNoAtomic_ = false; + if (config_.ib.mode == EndpointConfig::Ib::Mode::HostNoAtomic) { + ibNoAtomic_ = true; + } else if (config_.ib.mode == EndpointConfig::Ib::Mode::Default) { + // Use environment variable when mode is Default + ibNoAtomic_ = (env()->ibvMode == "host-no-atomic"); + } + + // If mode is Host (or Default resolved to host), check if 
atomics are supported.
+  if (!ibNoAtomic_) {
+    IbCtx* ibCtx = contextImpl.getIbContext(config_.transport);
+    if (!ibCtx->supportsRdmaAtomics()) {
+      WARN(NET, "IB device ", ibCtx->getDevName(),
+           " does not support RDMA atomics. Falling back to write-with-immediate mode (HostNoAtomic).");
+      ibNoAtomic_ = true;
+    }
+  }
+
+  // Resolve GID index: an explicit value (>= 0) takes priority, otherwise use the env.
+  if (config_.ib.gidIndex < 0) {
+    config_.ib.gidIndex = env()->ibGidIndex;
+  }
+
+  int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;
+
   ibQp_ = contextImpl.getIbContext(config_.transport)
               ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
-                         config_.ib.maxSendWr, 0, config_.ib.maxWrPerSend);
+                         config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
   ibQpInfo_ = ibQp_->getInfo();
 } else if (config_.transport == Transport::Ethernet) {
   // Configuring Ethernet Interfaces
@@ -48,6 +79,7 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
   if (AllIBTransports.has(config_.transport)) {
     ibLocal_ = false;
     it = detail::deserialize(it, ibQpInfo_);
+    it = detail::deserialize(it, ibNoAtomic_);
   } else if (config_.transport == Transport::Ethernet) {
     it = detail::deserialize(it, socketAddress_);
   }
@@ -77,6 +109,7 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
   detail::serialize(data, pimpl_->pidHash_);
   if (AllIBTransports.has(pimpl_->config_.transport)) {
     detail::serialize(data, pimpl_->ibQpInfo_);
+    detail::serialize(data, pimpl_->ibNoAtomic_);
   } else if (pimpl_->config_.transport == Transport::Ethernet) {
     detail::serialize(data, pimpl_->socketAddress_);
   }
diff --git a/src/core/env.cpp b/src/core/env.cpp
index 35a31f4c..7a42471b 100644
--- a/src/core/env.cpp
+++ b/src/core/env.cpp
@@ -54,18 +54,20 @@ Env::Env()
       logFile(readEnv("MSCCLPP_LOG_FILE", "")),
       hcaDevices(readEnv("MSCCLPP_HCA_DEVICES", "")),
       ibvSo(readEnv("MSCCLPP_IBV_SO", "")),
+      ibvMode(readEnv("MSCCLPP_IBV_MODE", "host")),
       hostid(readEnv("MSCCLPP_HOSTID", "")),
       socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")),
       socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")),
       commId(readEnv("MSCCLPP_COMM_ID", "")),
-      executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR",
-                               readEnv("HOME", "~") + "/.cache/mscclpp_default")),
+      cacheDir(readEnv("MSCCLPP_CACHE_DIR", readEnv("HOME", "~") + "/.cache/mscclpp")),
       npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")),
       cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)),
       ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")),
       forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
-      disableChannelCache(readEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", false)),
-      forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)) {}
+      ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
+      forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)),
+      forceDisableGdr(readEnv("MSCCLPP_FORCE_DISABLE_GDR", false)),
+      ibGidIndex(readEnv("MSCCLPP_IB_GID_INDEX", 0)) {}
 
 std::shared_ptr<Env> env() {
   static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
@@ -81,17 +83,20 @@ std::shared_ptr<Env> env() {
     logEnv("MSCCLPP_LOG_FILE", globalEnv->logFile);
     logEnv("MSCCLPP_HCA_DEVICES", globalEnv->hcaDevices);
     logEnv("MSCCLPP_IBV_SO", globalEnv->ibvSo);
+    logEnv("MSCCLPP_IBV_MODE", globalEnv->ibvMode);
     logEnv("MSCCLPP_HOSTID", globalEnv->hostid);
     logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily);
     logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname);
     logEnv("MSCCLPP_COMM_ID",
globalEnv->commId); - logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); + logEnv("MSCCLPP_CACHE_DIR", globalEnv->cacheDir); logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation); - logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache); + logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory); logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); + logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr); + logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex); } return globalEnv; } diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu index 4b1b06bc..28ced77f 100644 --- a/src/core/executor/execution_kernel.cu +++ b/src/core/executor/execution_kernel.cu @@ -32,6 +32,17 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else ); +#endif + break; + case DataType::UINT8: + executionKernel<<>>( + rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, + localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); #endif break; case DataType::FLOAT16: @@ -67,10 +78,16 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo ); #endif break; - case DataType::FP8_E4M3: - case DataType::FP8_E5M2: + case DataType::FLOAT8_E4M3: + case DataType::FLOAT8_E5M2: // FP8 is not supported in CUDA execution kernel. break; + case DataType::FLOAT8_E4M3B15: + // fp8_e4m3b15 is a software type not supported in the CUDA execution kernel. + break; + case DataType::AUTO: + // AUTO is a sentinel resolved before reaching this point; nothing to do. + break; } } diff --git a/src/core/gdr.cc b/src/core/gdr.cc new file mode 100644 index 00000000..22ac15c9 --- /dev/null +++ b/src/core/gdr.cc @@ -0,0 +1,204 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+#include "gdr.hpp"
+
+#if defined(MSCCLPP_USE_GDRCOPY)
+
+#include <gdrapi.h>
+#include <unistd.h>
+
+#include <mscclpp/env.hpp>
+#include <mscclpp/gpu_utils.hpp>
+
+#include "logger.hpp"
+
+#ifndef GPU_PAGE_SHIFT
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_MASK (~(GPU_PAGE_SIZE - 1))
+#endif
+
+namespace mscclpp {
+
+// GdrContext
+
+class GdrContext {
+ public:
+  GdrContext();
+  ~GdrContext();
+
+  GdrContext(const GdrContext&) = delete;
+  GdrContext& operator=(const GdrContext&) = delete;
+
+  GdrStatus status() const { return status_; }
+  gdr_t handle() const { return handle_; }
+
+ private:
+  GdrStatus status_;
+  gdr_t handle_;
+};
+
+static std::shared_ptr<GdrContext> gdrContext() {
+  static auto instance = std::make_shared<GdrContext>();
+  return instance;
+}
+
+GdrStatus gdrStatus() { return gdrContext()->status(); }
+
+bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
+
+const char* gdrStatusMessage() {
+  switch (gdrStatus()) {
+    case GdrStatus::Ok:
+      return "GDRCopy initialized successfully";
+    case GdrStatus::NotBuilt:
+      return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
+    case GdrStatus::Disabled:
+      return "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
+    case GdrStatus::DriverMissing:
+      return "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
+    case GdrStatus::OpenFailed:
+      return "gdr_open() failed; GDRCopy driver may be misconfigured";
+    default:
+      return "unknown GDRCopy status";
+  }
+}
+
+GdrContext::GdrContext() : status_(GdrStatus::Disabled), handle_(nullptr) {
+  if (env()->forceDisableGdr) {
+    INFO(GPU, "GDRCopy disabled via MSCCLPP_FORCE_DISABLE_GDR");
+    status_ = GdrStatus::Disabled;
+    return;
+  }
+
+  // Auto-detect: check if the driver is available.
+  if (access("/dev/gdrdrv", F_OK) != 0) {
+    INFO(GPU, "GDRCopy driver not detected, disabling GDRCopy");
+    status_ = GdrStatus::DriverMissing;
+    return;
+  }
+
+  handle_ = gdr_open();
+  if (handle_ == nullptr) {
+    INFO(GPU, "gdr_open() failed, disabling GDRCopy");
+    status_ = GdrStatus::OpenFailed;
+    return;
+  }
+
+  status_ = GdrStatus::Ok;
+  INFO(GPU, "GDRCopy initialized successfully");
+}
+
+GdrContext::~GdrContext() {
+  if (handle_ != nullptr) {
+    gdr_close(handle_);
+    handle_ = nullptr;
+  }
+}
+
+// GdrMap::Impl — real implementation with GDRCopy
+
+struct GdrMap::Impl {
+  std::shared_ptr<GdrContext> ctx;
+  std::shared_ptr<uint64_t> gpuMem;
+  gdr_mh_t mh;
+  void* barPtr;
+  uint64_t* hostDstPtr;
+  size_t mappedSize;
+};
+
+GdrMap::GdrMap(std::shared_ptr<uint64_t> gpuMem, int deviceId) : pimpl_(std::make_unique<Impl>()) {
+  pimpl_->ctx = gdrContext();
+  pimpl_->gpuMem = std::move(gpuMem);
+  pimpl_->mh = {};
+  pimpl_->barPtr = nullptr;
+  pimpl_->hostDstPtr = nullptr;
+  pimpl_->mappedSize = 0;
+
+  // Ensure the CUDA device context is active for gdr_pin_buffer.
+  CudaDeviceGuard deviceGuard(deviceId);
+
+  uint64_t gpuAddr = reinterpret_cast<uint64_t>(pimpl_->gpuMem.get());
+  // Align to a GPU page boundary and pin one page around the target address.
+  unsigned long alignedAddr = gpuAddr & GPU_PAGE_MASK;
+  unsigned long pageOffset = gpuAddr - alignedAddr;
+  pimpl_->mappedSize = GPU_PAGE_SIZE;
+
+  // Pin the GPU memory for GDRCopy BAR1 mapping. Try GDR_PIN_FLAG_FORCE_PCIE first for optimal
+  // ordering on platforms that support it (e.g., GB200). Fall back to flags=0 if FORCE_PCIE is
+  // not supported. Both paths work correctly: CPU writes via atomicStore, GPU reads via
+  // system-scope acquire.
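The page arithmetic above (GPU_PAGE_SHIFT = 16, i.e. 64 KiB GPU pages) is worth spelling out. A self-contained sketch of the same alignment math, separate from the patch:

    #include <cassert>
    #include <cstdint>

    constexpr uint64_t kGpuPageSize = 1ULL << 16;  // 64 KiB, matching GPU_PAGE_SHIFT above
    constexpr uint64_t kGpuPageMask = ~(kGpuPageSize - 1);

    struct PinRange {
      uint64_t alignedAddr;  // page-aligned base to pin
      uint64_t pageOffset;   // where the target lives inside the mapped page
    };

    PinRange pinRangeFor(uint64_t gpuAddr) {
      uint64_t base = gpuAddr & kGpuPageMask;
      return {base, gpuAddr - base};
    }

    int main() {
      PinRange r = pinRangeFor(0x7f0000012340ULL);
      assert(r.alignedAddr == 0x7f0000010000ULL && r.pageOffset == 0x2340ULL);
      return 0;
    }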
+ int ret = + gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, GDR_PIN_FLAG_FORCE_PCIE, &pimpl_->mh); + if (ret != 0) { + ret = gdr_pin_buffer_v2(pimpl_->ctx->handle(), alignedAddr, pimpl_->mappedSize, 0, &pimpl_->mh); + if (ret != 0) { + THROW(GPU, Error, ErrorCode::InternalError, "gdr_pin_buffer_v2 failed (ret=", ret, ") for addr ", (void*)gpuAddr, + ". Ensure the GPU memory is allocated with cudaMalloc (not cuMemCreate/cuMemMap)."); + } + } + + ret = gdr_map(pimpl_->ctx->handle(), pimpl_->mh, &pimpl_->barPtr, pimpl_->mappedSize); + if (ret != 0) { + (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh); + THROW(GPU, Error, ErrorCode::InternalError, "gdr_map failed (ret=", ret, ") for addr ", (void*)gpuAddr); + } + + pimpl_->hostDstPtr = reinterpret_cast(reinterpret_cast(pimpl_->barPtr) + pageOffset); + + INFO(GPU, "GDRCopy mapping established: GPU addr ", (void*)gpuAddr, " -> host ptr ", (const void*)pimpl_->hostDstPtr); +} + +GdrMap::~GdrMap() { + if (pimpl_) { + if (pimpl_->barPtr != nullptr) { + (void)gdr_unmap(pimpl_->ctx->handle(), pimpl_->mh, pimpl_->barPtr, pimpl_->mappedSize); + } + if (pimpl_->hostDstPtr != nullptr) { + (void)gdr_unpin_buffer(pimpl_->ctx->handle(), pimpl_->mh); + } + } +} + +bool GdrMap::valid() const { return pimpl_ && pimpl_->hostDstPtr != nullptr; } + +uint64_t* GdrMap::hostPtr() const { return pimpl_ ? pimpl_->hostDstPtr : nullptr; } + +void GdrMap::copyTo(const void* src, size_t size) { gdr_copy_to_mapping(pimpl_->mh, pimpl_->hostDstPtr, src, size); } + +void GdrMap::copyFrom(void* dst, size_t size) const { + gdr_copy_from_mapping(pimpl_->mh, dst, pimpl_->hostDstPtr, size); +} + +} // namespace mscclpp + +#else // !defined(MSCCLPP_USE_GDRCOPY) + +namespace mscclpp { + +GdrStatus gdrStatus() { return GdrStatus::NotBuilt; } + +bool gdrEnabled() { return false; } + +const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; } + +// GdrMap::Impl — stub (no GDRCopy) + +struct GdrMap::Impl {}; + +GdrMap::GdrMap(std::shared_ptr /*gpuMem*/, int /*deviceId*/) {} + +GdrMap::~GdrMap() = default; + +bool GdrMap::valid() const { return false; } + +uint64_t* GdrMap::hostPtr() const { return nullptr; } + +void GdrMap::copyTo(const void* /*src*/, size_t /*size*/) {} + +void GdrMap::copyFrom(void* /*dst*/, size_t /*size*/) const {} + +} // namespace mscclpp + +#endif // !defined(MSCCLPP_USE_GDRCOPY) diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc index 3c9b41c4..c863ecdd 100644 --- a/src/core/gpu_ipc_mem.cc +++ b/src/core/gpu_ipc_mem.cc @@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) { UnixSocketServer::instance().unregisterFd(handle->posixFd.fd); ::close(handle->posixFd.fd); } + if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) { + if (handle->fabric.allocHandle != 0) { + cuMemRelease(handle->fabric.allocHandle); + } + } delete handle; } } @@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter); handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; CUdeviceptr basePtr; size_t sz; @@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { // FABRIC handle if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS) { + 
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr)); handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; } @@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b handle->offsetFromBase = 0; handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; // POSIX FD handle int fileDesc; @@ -246,11 +254,18 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) { handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; + handle->fabric.allocHandle = allocHandle; } if (handle->typeFlags == GpuIpcMemHandle::Type::None) { + cuMemRelease(allocHandle); THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created"); } + + // Only release allocHandle if it is not stored in fabric.allocHandle. + if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) { + MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + } return handle; #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, @@ -270,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) { if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) == CUDA_SUCCESS) { + // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes. + handle_.fabric.allocHandle = {}; type_ = GpuIpcMemHandle::Type::Fabric; } } @@ -418,41 +435,45 @@ std::shared_ptr GpuIpcMem::mapMulticast([[maybe_unused]] int numDevices, [ // This will block until all devices call cuMulticastAddDevice() MSCCLPP_CUTHROW(cuMulticastBindAddr(allocHandle_, mcOffset, bufferAddr, bufferSize, 0)); + // cuMemMap requires offset to be 0 for multicast handles, so we map the entire range + // [0, mcOffset + bufferSize) and return a pointer at mcPtr + mcOffset. This only consumes + // extra virtual address space for the mcOffset region; no additional physical memory is used. 
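The deleter bookkeeping above matters because the pointer handed out is not the pointer that was mapped. A minimal sketch of the same ownership scheme, with malloc/free standing in for the cuMemAddressReserve/cuMemMap calls and their teardown:

    #include <cassert>
    #include <cstdlib>
    #include <memory>

    std::shared_ptr<char> mapWithOffset(size_t mcOffset, size_t bufferSize) {
      size_t mapSize = mcOffset + bufferSize;
      char* base = static_cast<char*>(std::malloc(mapSize));  // "map" the full range
      // Hand out base + mcOffset; the deleter frees from the captured base,
      // never from the aliased pointer it receives.
      return std::shared_ptr<char>(base + mcOffset, [base](char*) { std::free(base); });
    }

    int main() {
      auto p = mapWithOffset(4096, 65536);
      assert(p != nullptr);
      return 0;  // deleter releases the whole range, not just p.get()
    }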
+ size_t mapSize = mcOffset + bufferSize; CUdeviceptr mcPtr; - MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, bufferSize, minMcGran, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap(mcPtr, bufferSize, 0, allocHandle_, 0)); + MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, mapSize, minMcGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap(mcPtr, mapSize, 0, allocHandle_, 0)); CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, mapSize, &accessDesc, 1)); // Return shared_ptr with custom deleter that unmaps and unbinds CUmemGenericAllocationHandle allocHandle = allocHandle_; - return std::shared_ptr( - reinterpret_cast(mcPtr), [self = shared_from_this(), mcOffset, bufferSize, allocHandle](void* ptr) { - CUresult res; - const char* errStr; + return std::shared_ptr(reinterpret_cast(mcPtr + mcOffset), [self = shared_from_this(), mcPtr, mapSize, + mcOffset, bufferSize, allocHandle](void*) { + CUresult res; + const char* errStr; - res = cuMemUnmap((CUdeviceptr)ptr, bufferSize); - if (res != CUDA_SUCCESS) { - (void)cuGetErrorString(res, &errStr); - WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)ptr, ": ", errStr); - } + res = cuMemUnmap(mcPtr, mapSize); + if (res != CUDA_SUCCESS) { + (void)cuGetErrorString(res, &errStr); + WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)mcPtr, ": ", errStr); + } - res = cuMemAddressFree((CUdeviceptr)ptr, bufferSize); - if (res != CUDA_SUCCESS) { - (void)cuGetErrorString(res, &errStr); - WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)ptr, ": ", errStr); - } + res = cuMemAddressFree(mcPtr, mapSize); + if (res != CUDA_SUCCESS) { + (void)cuGetErrorString(res, &errStr); + WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)mcPtr, ": ", errStr); + } - int deviceId; - CUdevice device; - if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) { - (void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize); - } - }); + int deviceId; + CUdevice device; + if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) { + (void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize); + } + }); #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, "NVLS is not supported on this device (requires CUDA version >= 12.3 and Linux kernel version >= 5.6.0)"); diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 3aa6aa1c..628d2dcb 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -5,48 +5,7 @@ #include #include -#include "debug.h" - -static inline bool isCudaTeardownError(cudaError_t err) { -#if defined(MSCCLPP_USE_ROCM) - return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; -#else // !defined(MSCCLPP_USE_ROCM) - return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || - err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; -#endif // !defined(MSCCLPP_USE_ROCM) -} - -[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) { - return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; -} - -#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - cudaError_t __e = cmd; \ - if (isCudaTeardownError(__e)) { \ - 
(void)cudaGetLastError(); \ - } else { \ - MSCCLPP_CUDATHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - CUresult __e = cmd; \ - if (!isCuTeardownError(__e)) { \ - MSCCLPP_CUTHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE(cmd) \ - do { \ - CUresult __e = cmd; \ - if (__e != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(__e, &errStr); \ - WARN("%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, __e, errStr); \ - } \ - } while (false) +#include "gpu_utils_internal.hpp" namespace mscclpp { diff --git a/src/core/ib.cc b/src/core/ib.cc index 9b86cdf1..557f0426 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -3,6 +3,7 @@ #include "ib.hpp" +#include #include #include @@ -20,6 +21,9 @@ #include "context.hpp" #if defined(USE_IBVERBS) #include "ibverbs_wrapper.hpp" +#if defined(MSCCLPP_USE_MLX5DV) +#include "mlx5dv_wrapper.hpp" +#endif // defined(MSCCLPP_USE_MLX5DV) #endif // defined(USE_IBVERBS) #include "logger.hpp" @@ -63,7 +67,7 @@ static inline bool isDmabufSupportedByGpu(int gpuId) { return ret; } -IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff), size_(0) { +IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect) : mr_(nullptr), buff_(buff), size_(0) { if (size == 0) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid MR size: 0"); } @@ -79,13 +83,50 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff) bool isGpuBuff = (gpuId != -1); if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) { #if !defined(MSCCLPP_USE_ROCM) - int fd; - MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, pages * pageSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + int fd = -1; + size_t rangeSize = pages * pageSize; + // Obtain a DMA-BUF file descriptor for the GPU memory range. On platforms with a CPU-GPU + // bridge that reorders posted writes (e.g., Grace/GB200 NVLink-C2C), the PCIe mapping flag + // routes DMA through the Data Direct engine for correct ordering and higher throughput. + // Fall back to the default (non-PCIe) mapping if the flag is unsupported. +#if (CUDA_VERSION >= 12030) + CUresult cuRes = cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (cuRes != CUDA_SUCCESS || fd < 0) { + if (fd >= 0) ::close(fd); + fd = -1; + } + bool usedPcieFlag = (fd >= 0); +#endif // CUDA_VERSION >= 12030 + if (fd < 0) { + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + } + + // Register the DMA-BUF memory region. When Data Direct is available, use the mlx5dv API + // which enables hardware-level Data Direct routing for the MR. Otherwise use standard verbs. 
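The registration ladder above (mlx5dv Data Direct MR when available, then standard verbs; a PCIe-mapped fd first, then the default mapping) reduces to a simple try-then-fall-back pattern. A simplified sketch with stand-in stubs, not the real verbs/mlx5dv API:

    #include <cstdio>

    // Illustrative stand-ins; nullptr means registration failed.
    static int dummyMr = 0;
    static void* regDataDirectMr(int /*fd*/) { return nullptr; }  // mlx5dv path unavailable here
    static void* regStandardMr(int fd) { return fd >= 0 ? &dummyMr : nullptr; }

    // Mirrors the constructor above: preferred path with the PCIe-mapped fd,
    // then standard verbs, then standard verbs with the default-mapped fd.
    static void* registerDmabufMr(int pcieFd, int defaultFd) {
      void* mr = regDataDirectMr(pcieFd);
      if (mr == nullptr) mr = regStandardMr(pcieFd);
      if (mr == nullptr) mr = regStandardMr(defaultFd);
      return mr;
    }

    int main() {
      std::printf("mr=%p\n", registerDmabufMr(3, 4));
      return 0;
    }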
size_t offsetInDmaBuf = buffIntPtr % pageSize; - mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | - IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC); + int accessFlags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC; + +#if defined(MSCCLPP_USE_MLX5DV) + if (isDataDirect) { + mr_ = MLX5DV::mlx5dv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } +#endif + if (mr_ == nullptr) { + mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } + + // If MR registration failed with a PCIe-mapped fd, retry with the default mapping. +#if (CUDA_VERSION >= 12030) + if (mr_ == nullptr && usedPcieFlag) { + ::close(fd); + MSCCLPP_CUTHROW(cuMemGetHandleForAddressRange(&fd, addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); + mr_ = IBVerbs::ibv_reg_dmabuf_mr(pd, offsetInDmaBuf, size, buffIntPtr, fd, accessFlags); + } +#endif // CUDA_VERSION >= 12030 + ::close(fd); if (mr_ == nullptr) { THROW(NET, IbError, errno, "ibv_reg_dmabuf_mr failed (errno ", errno, ")"); @@ -129,30 +170,47 @@ const void* IbMr::getBuff() const { return buff_; } uint32_t IbMr::getLkey() const { return mr_->lkey; } -IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) +IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, + int maxSendWr, int maxRecvWr, int maxWrPerSend, bool noAtomic) : portNum_(portNum), gidIndex_(gidIndex), info_(), qp_(nullptr), - cq_(nullptr), - wcs_(), - wrs_(), - sges_(), - wrn_(0), - numSignaledPostedItems_(0), - numSignaledStagedItems_(0), - maxCqPollNum_(maxCqPollNum), - maxWrPerSend_(maxWrPerSend) { - cq_ = IBVerbs::ibv_create_cq(ctx, maxCqSize, nullptr, nullptr, 0); - if (cq_ == nullptr) { + sendCq_(nullptr), + recvCq_(nullptr), + sendWcs_(), + recvWcs_(), + sendWrs_(), + sendSges_(), + recvWrs_(), + recvSges_(), + numStagedSend_(0), + numStagedRecv_(0), + numPostedSignaledSend_(0), + numStagedSignaledSend_(0), + maxSendCqPollNum_(maxSendCqPollNum), + maxSendWr_(maxSendWr), + maxWrPerSend_(maxWrPerSend), + maxRecvWr_(maxRecvWr), + noAtomic_(noAtomic) { + sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0); + if (sendCq_ == nullptr) { THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); } + // Only create recv CQ if maxRecvWr > 0 + if (maxRecvWr > 0) { + recvCq_ = IBVerbs::ibv_create_cq(ctx, maxRecvWr, nullptr, nullptr, 0); + if (recvCq_ == nullptr) { + THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); + } + } + struct ibv_qp_init_attr qpInitAttr = {}; qpInitAttr.sq_sig_all = 0; - qpInitAttr.send_cq = cq_; - qpInitAttr.recv_cq = cq_; + qpInitAttr.send_cq = sendCq_; + // Use separate recv CQ if created, otherwise use the send CQ + qpInitAttr.recv_cq = (recvCq_ != nullptr) ? 
recvCq_ : sendCq_; qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.cap.max_send_wr = maxSendWr; qpInitAttr.cap.max_recv_wr = maxRecvWr; @@ -173,9 +231,9 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz info_.linkLayer = portAttr.link_layer; info_.qpn = qp->qp_num; info_.mtu = portAttr.active_mtu; - info_.is_grh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); + info_.isGrh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.is_grh) { + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.isGrh) { if (gidIndex_ >= portAttr.gid_tbl_len) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid GID index ", gidIndex_, " for port ", portNum_, " (max index is ", portAttr.gid_tbl_len - 1, ")"); @@ -194,19 +252,28 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz qpAttr.qp_state = IBV_QPS_INIT; qpAttr.pkey_index = 0; qpAttr.port_num = portNum_; - qpAttr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; + qpAttr.qp_access_flags = noAtomic_ ? IBV_ACCESS_REMOTE_WRITE + : (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC); if (IBVerbs::ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) { THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")"); } qp_ = qp; - wrs_ = std::make_shared>(maxWrPerSend_); - sges_ = std::make_shared>(maxWrPerSend_); - wcs_ = std::make_shared>(maxCqPollNum_); + sendWrs_ = std::make_shared>(maxWrPerSend_); + sendSges_ = std::make_shared>(maxWrPerSend_); + sendWcs_ = std::make_shared>(maxSendCqPollNum_); + recvWcs_ = std::make_shared>(maxRecvWr_); + if (maxRecvWr_ > 0) { + recvWrs_ = std::make_shared>(maxRecvWr_); + recvSges_ = std::make_shared>(maxRecvWr_); + } } IbQp::~IbQp() { IBVerbs::ibv_destroy_qp(qp_); - IBVerbs::ibv_destroy_cq(cq_); + IBVerbs::ibv_destroy_cq(sendCq_); + if (recvCq_ != nullptr) { + IBVerbs::ibv_destroy_cq(recvCq_); + } } void IbQp::rtr(const IbQpInfo& info) { @@ -215,9 +282,9 @@ void IbQp::rtr(const IbQpInfo& info) { qp_attr.path_mtu = static_cast(info.mtu); qp_attr.dest_qp_num = info.qpn; qp_attr.rq_psn = 0; - qp_attr.max_dest_rd_atomic = 1; + qp_attr.max_dest_rd_atomic = noAtomic_ ? 0 : 1; qp_attr.min_rnr_timer = 0x12; - if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.is_grh) { + if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) { qp_attr.ah_attr.is_global = 1; qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info.spn; qp_attr.ah_attr.grh.dgid.global.interface_id = info.iid; @@ -247,7 +314,7 @@ void IbQp::rts() { qp_attr.retry_cnt = 7; qp_attr.rnr_retry = 7; qp_attr.sq_psn = 0; - qp_attr.max_rd_atomic = 1; + qp_attr.max_rd_atomic = noAtomic_ ? 0 : 1; int ret = IBVerbs::ibv_modify_qp( qp_, &qp_attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC); @@ -256,25 +323,25 @@ void IbQp::rts() { } } -IbQp::WrInfo IbQp::getNewWrInfo() { - if (wrn_ >= maxWrPerSend_) { - THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding work requests. limit is ", maxWrPerSend_); +IbQp::SendWrInfo IbQp::getNewSendWrInfo() { + if (numStagedSend_ >= maxWrPerSend_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many staged work requests. 
limit is ", maxWrPerSend_); } - ibv_send_wr* wr_ = &wrs_->data()[wrn_]; - ibv_sge* sge_ = &sges_->data()[wrn_]; + ibv_send_wr* wr_ = &sendWrs_->data()[numStagedSend_]; + ibv_sge* sge_ = &sendSges_->data()[numStagedSend_]; wr_->sg_list = sge_; wr_->num_sge = 1; wr_->next = nullptr; - if (wrn_ > 0) { - (*wrs_)[wrn_ - 1].next = wr_; + if (numStagedSend_ > 0) { + (*sendWrs_)[numStagedSend_ - 1].next = wr_; } - wrn_++; - return IbQp::WrInfo{wr_, sge_}; + numStagedSend_++; + return IbQp::SendWrInfo{wr_, sge_}; } -void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -283,12 +350,12 @@ void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64 wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, - bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -298,62 +365,149 @@ void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, u wrInfo.sge->addr = (uint64_t)(mr->getBuff()); wrInfo.sge->length = sizeof(uint64_t); // atomic op is always on uint64_t wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled, unsigned int immData) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wrInfo.wr->send_flags = signaled ? 
IBV_SEND_SIGNALED : 0; wrInfo.wr->wr.rdma.remote_addr = (uint64_t)(info.addr) + dstOffset; wrInfo.wr->wr.rdma.rkey = info.rkey; - wrInfo.wr->imm_data = immData; - wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; - wrInfo.sge->length = size; - wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + wrInfo.wr->imm_data = htonl(immData); + if (mr != nullptr) { + wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + } else { + // 0-byte write-with-imm: no source buffer needed + wrInfo.sge->addr = 0; + wrInfo.sge->length = 0; + wrInfo.sge->lkey = 0; + } + if (signaled) numStagedSignaledSend_++; } void IbQp::postSend() { - if (wrn_ == 0) { + if (numStagedSend_ == 0) { return; } struct ibv_send_wr* bad_wr; - int err = IBVerbs::ibv_post_send(qp_, wrs_->data(), &bad_wr); + int err = IBVerbs::ibv_post_send(qp_, sendWrs_->data(), &bad_wr); if (err != 0) { THROW(NET, IbError, err, "ibv_post_send failed (errno ", err, ")"); } - wrn_ = 0; - numSignaledPostedItems_ += numSignaledStagedItems_; - numSignaledStagedItems_ = 0; - if (numSignaledPostedItems_ + 4 > cq_->cqe) { - WARN(NET, "IB: CQ is almost full ( ", numSignaledPostedItems_, " / ", cq_->cqe, + numStagedSend_ = 0; + numPostedSignaledSend_ += numStagedSignaledSend_; + numStagedSignaledSend_ = 0; + if (numPostedSignaledSend_ + 4 > sendCq_->cqe) { + WARN(NET, "IB: CQ is almost full ( ", numPostedSignaledSend_, " / ", sendCq_->cqe, " ). The connection needs to be flushed to prevent timeout errors."); } } -int IbQp::pollCq() { - int wcNum = IBVerbs::ibv_poll_cq(cq_, maxCqPollNum_, wcs_->data()); +IbQp::RecvWrInfo IbQp::getNewRecvWrInfo() { + if (numStagedRecv_ >= maxRecvWr_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding recv work requests. limit is ", maxRecvWr_); + } + ibv_recv_wr* wr = &recvWrs_->data()[numStagedRecv_]; + ibv_sge* sge = &recvSges_->data()[numStagedRecv_]; + wr->next = nullptr; + if (numStagedRecv_ > 0) { + (*recvWrs_)[numStagedRecv_ - 1].next = wr; + } + numStagedRecv_++; + return IbQp::RecvWrInfo{wr, sge}; +} + +void IbQp::stageRecv(uint64_t wrId) { + auto wrInfo = this->getNewRecvWrInfo(); + // For RDMA write-with-imm, data goes to remote_addr specified by sender. + // We only need the recv WR to get the completion notification with imm_data. 
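Note the byte-order handling around imm_data: stageSendWriteWithImm above applies htonl before posting, and getRecvWcImmData below undoes it with ntohl, since imm_data travels big-endian on the wire. A tiny runnable check of that round trip:

    #include <arpa/inet.h>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t token = 0x12345678u;
      uint32_t onWire = htonl(token);   // what the sender stores in imm_data
      assert(ntohl(onWire) == token);   // what the receiver recovers
      return 0;
    }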
+ wrInfo.wr->wr_id = wrId; + wrInfo.wr->sg_list = nullptr; + wrInfo.wr->num_sge = 0; +} + +void IbQp::stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset) { + auto wrInfo = this->getNewRecvWrInfo(); + wrInfo.wr->wr_id = wrId; + wrInfo.sge->addr = reinterpret_cast(mr->getBuff()) + offset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + wrInfo.wr->sg_list = wrInfo.sge; + wrInfo.wr->num_sge = 1; +} + +void IbQp::postRecv() { + if (numStagedRecv_ == 0) return; + struct ibv_recv_wr* bad_wr; + int err = IBVerbs::ibv_post_recv(qp_, recvWrs_->data(), &bad_wr); + if (err != 0) { + THROW(NET, IbError, err, "ibv_post_recv failed (errno ", err, ")"); + } + numStagedRecv_ = 0; +} + +int IbQp::pollSendCq() { + int wcNum = IBVerbs::ibv_poll_cq(sendCq_, maxSendCqPollNum_, sendWcs_->data()); if (wcNum > 0) { - numSignaledPostedItems_ -= wcNum; + numPostedSignaledSend_ -= wcNum; } return wcNum; } -int IbQp::getWcStatus(int idx) const { return (*wcs_)[idx].status; } +int IbQp::pollRecvCq() { + int wcNum = IBVerbs::ibv_poll_cq(recvCq_, maxRecvWr_, recvWcs_->data()); + return wcNum; +} -std::string IbQp::getWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*wcs_)[idx].status); } +int IbQp::getSendWcStatus(int idx) const { return (*sendWcs_)[idx].status; } -int IbQp::getNumCqItems() const { return numSignaledPostedItems_; } +std::string IbQp::getSendWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*sendWcs_)[idx].status); } -IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr) { +int IbQp::getNumSendCqItems() const { return numPostedSignaledSend_; } + +int IbQp::getRecvWcStatus(int idx) const { return (*recvWcs_)[idx].status; } + +std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*recvWcs_)[idx].status); } + +unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); } + +IbCtx::IbCtx(const std::string& devName) + : devName_(devName), + ctx_(nullptr), + pd_(nullptr), + supportsRdmaAtomics_(false), + isMlx5_(false), + isDataDirect_(false), + isVF_(false) { int num; struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { if (std::string(devices[i]->name) == devName_) { ctx_ = IBVerbs::ibv_open_device(devices[i]); + + // Detect if this IB device is a Virtual Function (VF). + // VFs have a 'physfn' sysfs symlink pointing to their parent PF; PFs do not. 
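The sysfs probe described above is small enough to show standalone; a self-contained sketch of the same VF check performed in the block below:

    #include <string>
    #include <unistd.h>

    // A VF exposes .../device/physfn (a symlink to its parent PF); a PF does not.
    // access(2) with F_OK is enough to test for it.
    bool isVirtualFunction(const std::string& devName) {
      std::string physfn = "/sys/class/infiniband/" + devName + "/device/physfn";
      return access(physfn.c_str(), F_OK) == 0;
    }

    int main() { return isVirtualFunction("mlx5_0") ? 0 : 1; }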
+ { + std::string physfnPath = "/sys/class/infiniband/" + devName_ + "/device/physfn"; + isVF_ = (access(physfnPath.c_str(), F_OK) == 0); + if (isVF_) { + INFO(NET, "IB device ", devName_, " is a Virtual Function (Data Direct ordering available)"); + } + } + +#if defined(MSCCLPP_USE_MLX5DV) + if (MLX5DV::isAvailable()) { + isMlx5_ = MLX5DV::mlx5dv_is_supported(devices[i]); + if (isMlx5_) { + INFO(NET, "IB device ", devName_, " supports mlx5 Direct Verbs"); + } + } +#endif // defined(MSCCLPP_USE_MLX5DV) break; } } @@ -365,6 +519,26 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_ if (pd_ == nullptr) { THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")"); } + + // Detect Data Direct support via mlx5dv_get_data_direct_sysfs_path +#if defined(MSCCLPP_USE_MLX5DV) + if (isMlx5_ && MLX5DV::isAvailable()) { + char sysfsPath[256]; + int ret = MLX5DV::mlx5dv_get_data_direct_sysfs_path(ctx_, sysfsPath, sizeof(sysfsPath)); + if (ret == 0) { + isDataDirect_ = true; + INFO(NET, "IB device ", devName_, " supports Data Direct (sysfs: ", sysfsPath, ")"); + } else { + INFO(NET, "IB device ", devName_, " does not support Data Direct"); + } + } +#endif // defined(MSCCLPP_USE_MLX5DV) + + // Query and cache RDMA atomics capability + struct ibv_device_attr attr = {}; + if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) { + supportsRdmaAtomics_ = (attr.atomic_cap == IBV_ATOMIC_HCA || attr.atomic_cap == IBV_ATOMIC_GLOB); + } } IbCtx::~IbCtx() { @@ -419,8 +593,8 @@ int IbCtx::getAnyUsablePort(int gidIndex) const { return -1; } -std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) { +std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic) { if (port == -1) { port = this->getAnyUsablePort(gidIndex); if (port == -1) { @@ -429,14 +603,22 @@ std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int } else if (!this->isPortUsable(port, gidIndex)) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port); } - return std::shared_ptr( - new IbQp(ctx_, pd_, port, gidIndex, maxCqSize, maxCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); + return std::shared_ptr(new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, + maxRecvWr, maxWrPerSend, noAtomic)); } std::unique_ptr IbCtx::registerMr(void* buff, std::size_t size) { - return std::unique_ptr(new IbMr(pd_, buff, size)); + return std::unique_ptr(new IbMr(pd_, buff, size, isDataDirect_)); } +bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; } + +bool IbCtx::isMlx5() const { return isMlx5_; } + +bool IbCtx::isDataDirect() const { return isDataDirect_; } + +bool IbCtx::isVirtualFunction() const { return isVF_; } + MSCCLPP_API_CPP int getIBDeviceCount() { int num; IBVerbs::ibv_get_device_list(&num); @@ -542,6 +724,34 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport) { return ""; } MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string&) { return Transport::Unknown; } +IbMr::~IbMr() {} +IbMrInfo IbMr::getInfo() const { return IbMrInfo(); } +const void* IbMr::getBuff() const { return nullptr; } +uint32_t IbMr::getLkey() const { return 0; } + +IbQp::~IbQp() {} +void IbQp::rtr(const IbQpInfo& /*info*/) {} +void IbQp::rts() {} +void IbQp::stageSendWrite(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t 
/*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/) {} +void IbQp::stageSendAtomicAdd(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint64_t /*wrId*/, uint64_t /*dstOffset*/, + uint64_t /*addVal*/, bool /*signaled*/) {} +void IbQp::stageSendWriteWithImm(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/, + unsigned int /*immData*/) {} +void IbQp::postSend() {} +void IbQp::stageRecv(uint64_t /*wrId*/) {} +void IbQp::stageRecv(const IbMr* /*mr*/, uint64_t /*wrId*/, uint32_t /*size*/, uint64_t /*offset*/) {} +void IbQp::postRecv() {} +int IbQp::pollSendCq() { return 0; } +int IbQp::pollRecvCq() { return 0; } +int IbQp::getSendWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getSendWcStatusString(int /*idx*/) const { return ""; } +int IbQp::getNumSendCqItems() const { return 0; } +int IbQp::getRecvWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getRecvWcStatusString(int /*idx*/) const { return ""; } +unsigned int IbQp::getRecvWcImmData(int /*idx*/) const { return 0; } + #endif // !defined(USE_IBVERBS) } // namespace mscclpp diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index c9d81d41..22a9930f 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -4,11 +4,19 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +#include +#include #include #include +#include +#include +#include +#include #include "communicator.hpp" #include "context.hpp" +#include "endpoint.hpp" +#include "gdr.hpp" #include "ib.hpp" #include "registered_memory.hpp" #include "socket.h" @@ -29,6 +37,19 @@ class BaseConnection { virtual void flush(int64_t timeoutUsec = -1) = 0; + /// Start signal forwarding to the given memory address. + /// Called by the semaphore to specify where incoming signals should be written. + /// @param mem Shared pointer to the GPU memory for the signal token. + virtual void startSignalForwarding(std::shared_ptr /*mem*/) {} + + /// Stop signal forwarding and release associated resources. + virtual void stopSignalForwarding() {} + + /// Whether this connection uses signal forwarding (e.g., IB host-no-atomic mode). + /// When true, the semaphore must allocate a separate inboundToken_ for the recv thread to write to. + /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics). + virtual bool isSignalForwarding() const { return false; } + virtual Transport transport() const = 0; virtual Transport remoteTransport() const = 0; @@ -39,6 +60,8 @@ class BaseConnection { int getMaxWriteQueueSize() const; + static std::shared_ptr& getImpl(Connection& conn) { return conn.impl_; } + protected: friend class Context; friend class CudaIpcConnection; @@ -77,12 +100,45 @@ class IBConnection : public BaseConnection { Transport transport_; Transport remoteTransport_; std::weak_ptr qp_; - std::unique_ptr dummyAtomicSource_; // not used anywhere but IB needs a source - RegisteredMemory dummyAtomicSourceMem_; - mscclpp::TransportInfo dstTransportInfo_; + std::unique_ptr atomicSrc_; + RegisteredMemory atomicSrcMem_; + mscclpp::TransportInfo atomicSrcTransportInfo_; + + // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal + // instead of atomic operations, with a host thread forwarding to GPU for memory consistency. 
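The BaseConnection virtuals above ship no-op defaults so that only transports that actually forward signals (IB HostNoAtomic) override them. A minimal sketch of that interface pattern, with hypothetical class names:

    #include <cstdint>
    #include <memory>
    #include <utility>

    struct ConnBase {
      virtual ~ConnBase() = default;
      virtual bool isSignalForwarding() const { return false; }         // default: NIC writes directly
      virtual void startSignalForwarding(std::shared_ptr<uint64_t>) {}  // default: nothing to start
    };

    struct IbNoAtomicConn : ConnBase {
      bool isSignalForwarding() const override { return true; }
      void startSignalForwarding(std::shared_ptr<uint64_t> mem) override { token_ = std::move(mem); }
      std::shared_ptr<uint64_t> token_;
    };

    int main() {
      IbNoAtomicConn c;
      ConnBase& b = c;
      if (b.isSignalForwarding()) b.startSignalForwarding(std::make_shared<uint64_t>(0));
      return 0;
    }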
+ bool ibNoAtomic_; + bool gdrSignalForwarding_; // ibNoAtomic_ && gdrEnabled() — decided once at construction + std::thread recvThread_; + std::atomic stopRecvThread_; + std::atomic recvThreadError_; // Set by recv thread on fatal error + std::string recvThreadErrorMsg_; // Error message from recv thread (written before recvThreadError_ is set) + int localGpuDeviceId_; // Local GPU device ID for CUDA context and GDR mapping + + // Signal forwarding design (HostNoAtomic mode): + // - Sender: 0-byte RDMA WRITE_WITH_IMM carrying the lower 32 bits of the token in imm_data. + // - Receiver: CPU recv thread polls recv CQ for WRITE_WITH_IMM completions (CQE), reads + // the lower 32 bits from imm_data, reconstructs the full 64-bit token using wrap-around + // detection (monotonically increasing tokens: if lower 32 bits decrease, the upper half + // incremented), then writes it to signalAddr_ via atomicStore through GDRCopy BAR1. + uint64_t signalAddr_; + + std::unique_ptr signalGdrMap_; + + void recvThreadFunc(); public: IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint); + ~IBConnection(); + + /// Start signal forwarding to the given memory address. + /// Must be called before the remote sends any updateAndSync in HostNoAtomic mode. + /// @param mem Shared pointer to the GPU memory for the signal token. + void startSignalForwarding(std::shared_ptr mem) override; + + /// Stop signal forwarding and release associated resources. + void stopSignalForwarding() override; + + bool isSignalForwarding() const override; Transport transport() const override; diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp index b53a2662..42d03db1 100644 --- a/src/core/include/context.hpp +++ b/src/core/include/context.hpp @@ -24,9 +24,9 @@ class CudaIpcStream { public: CudaIpcStream(int deviceId); - void memcpyD2D(void *dst, const void *src, size_t nbytes); + void memcpyD2D(void* dst, const void* src, size_t nbytes); - void memcpyH2D(void *dst, const void *src, size_t nbytes); + void memcpyH2D(void* dst, const void* src, size_t nbytes); void sync(); @@ -42,9 +42,7 @@ struct Context::Impl { std::shared_ptr tokenPool_; const size_t maxNumTokens_ = 1 << 15; // 32K tokens - Impl(); - - IbCtx *getIbContext(Transport ibTransport); + IbCtx* getIbContext(Transport ibTransport); std::shared_ptr getToken(); }; diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp index a3a5ad41..363faab1 100644 --- a/src/core/include/endpoint.hpp +++ b/src/core/include/endpoint.hpp @@ -4,6 +4,7 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include #include #include @@ -24,6 +25,7 @@ struct Endpoint::Impl { // The following are only used for IB and are undefined for other transports. 
bool ibLocal_; + bool ibNoAtomic_; std::shared_ptr ibQp_; IbQpInfo ibQpInfo_; diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index fb6c436f..87b88888 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -17,356 +17,7 @@ #include #include "execution_common.hpp" - -namespace { -#if defined(MSCCLPP_DEVICE_COMPILE) -template -MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { - return a + b; -} - -template <> -MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat16 add_elements(__bfloat16 a, __bfloat16 b) { - return __hadd(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - return __hadd2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 addition using __hadd (single element) -template <> -MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#else - return __fp8_e4m3(__hadd(__half(a), __half(b))); -#endif -} - -// FP8 E5M2 addition using __hadd (single element) - must come before helper functions -template <> -MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#else - return __fp8_e5m2(__hadd(__half(a), __half(b))); -#endif -} - -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) -// HIP gfx942 platform: Helper functions for vectorized FP8 operations -// We use separate function names because __fp8x2_e4m3 and __fp8x2_e5m2 are both uint16_t - -// E4M3 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -} - -// E4M3 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e4m3(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e4m3(a_low, b_low); - uint16_t result_high = add_fp8x2_e4m3(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} - -// E5M2 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0))); - 
return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false); -} - -// E5M2 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e5m2(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e5m2(a_low, b_low); - uint16_t result_high = add_fp8x2_e5m2(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} -#endif - -#if !defined(MSCCLPP_DEVICE_HIP) -// CUDA platform: Template specializations for vectorized FP8 operations - -// FP8 E4M3 vectorized addition using __hadd2 for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { - return __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E4M3 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e4m3) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__bfloat16>(int4 a, int4 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e4m3(a.w, b.w); - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - ret.z = add_fp8x4_e4m3(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e5m2>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e5m2(a.w, b.w); - ret.x 
= add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - ret.z = add_fp8x4_e5m2(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__bfloat16>(uint2 a, uint2 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e5m2>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__bfloat16>(int a, int b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e5m2>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, 
uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e5m2>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -#endif // MSCCLPP_DEVICE_COMPILE - -} // namespace - +#include "reduce_kernel.hpp" namespace mscclpp { #if defined(MSCCLPP_DEVICE_COMPILE) @@ -534,7 +185,7 @@ MSCCLPP_DEVICE_INLINE void handlePut(const Operation& op, void* input, void* out } } -template +template MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -559,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = add_vectors(tmp, val); + tmp = calVector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -587,7 +238,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input getOffset(memoryChannelBufferTypes_[op.inputBufferRefs[index + 1].id], offset)) / sizeof(T); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); - tmp = add_elements(tmp, mscclpp::read(remoteMemory, srcOffset + idx)); + tmp = tmp + mscclpp::read(remoteMemory, srcOffset + idx); } static_cast(output)[idx] = tmp; if constexpr (SendToRemote) { @@ -647,11 +298,11 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat ChannelType chType = op.channelType; if (chType == ChannelType::MEMORY) { size_t nPackets = size / sizeof(PacketPayload); + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { + PacketPayload data = pkts[pktIdx].read(flag_); + PacketType pkt(data, flag_); for (uint32_t idx = 0; idx < nOutput; ++idx) { - PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1)); - PacketPayload data = pkts[pktIdx].read(flag_); - PacketType pkt(data, flag_); size_t offset = (scratchOffset_ + (dstOffsets[idx] << 1)) / sizeof(PacketType); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.outputBufferRefs[idx].id]); mscclpp::write(remoteMemory, offset + pktIdx, pkt); @@ -661,10 +312,8 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat // Ensuring Data Is Ready size_t nPackets = size / sizeof(PacketPayload); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { - for (uint32_t idx = 0; idx < nOutput; ++idx) { - PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1)); - pkts[pktIdx].read(flag_); - } + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); + pkts[pktIdx].read(flag_); } __syncthreads(); @@ -674,14 +323,14 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat return; } uint32_t dstOffset = (dstOffsets[chIdx] << 1) + scratchOffset_; - uint32_t srcOffset = (srcOffsets[chIdx] << 1) + scratchOffset_; + uint32_t 
srcOffset = (srcOffsets[0] << 1) + scratchOffset_; MemoryId dstMemoryId = portChannelBufferIds_[op.outputBufferRefs[chIdx].id]; portChannels_[channelIndexes[chIdx]].put( dstMemoryId, dstOffset, static_cast(BufferType::SCRATCH) + localMemoryIdBegin_, srcOffset, size << 1); } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -704,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = calVector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -720,7 +369,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -745,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = calVector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -790,7 +439,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPackets(const Operation& op, void* input, v mscclpp::copyToPackets(dst, src, size, threadIdx.x, blockDim.x, flag_); } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -815,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = add_vectors(tmp, val); + tmp = calVector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -840,7 +489,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo T* buff = static_cast(getBuffer(input, output, scratch, inputBufferRefs[index].type)); uint32_t buffOffset = (inputOffsets[index] + getOffset(inputBufferRefs[index].type, offset)) / sizeof(T); - tmp = add_elements(tmp, buff[buffOffset + idx]); + tmp = tmp + buff[buffOffset + idx]; } dst[idx] = tmp; if constexpr (SendToRemote) { @@ -872,51 +521,56 @@ MSCCLPP_DEVICE_INLINE void handleCopy(const Operation& op, void* input, void* ou #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint32_t offset, uint32_t unitSize) { - static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); - const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); - if (size <= 0) { + if constexpr 
(std::is_same_v) { + assert(false && "MULTI_LOAD_REDUCE_STORE is not supported for uint8_t data type"); return; - } - const uint32_t srcOffset = op.inputOffsets[0] + getOffset(op.nvlsInputBufferType, offset); - const uint32_t dstOffset = op.outputOffsets[0] + getOffset(op.nvlsOutputBufferType, offset); - assert(size % sizeof(T) == 0); - assert(srcOffset % sizeof(T) == 0); - assert(dstOffset % sizeof(T) == 0); - - T* src = (T*)nvlsChannels_[op.nvlsInputIndex].mcPtr; - T* dst = (T*)nvlsChannels_[op.nvlsOutputIndex].mcPtr; - if constexpr (std::is_same_v || std::is_same_v) { - const size_t nElem = size / sizeof(T); - const size_t srcOffsetElem = srcOffset / sizeof(T); - const size_t dstOffsetElem = dstOffset / sizeof(T); - VectorType* srcElem = reinterpret_cast*>(src + srcOffsetElem); - VectorType* dstElem = reinterpret_cast*>(dst + dstOffsetElem); - for (size_t idx = threadIdx.x; idx < nElem; idx += blockDim.x) { - auto val = SwitchChannelDeviceHandle::multimemLoadReduce(srcElem + idx); - SwitchChannelDeviceHandle::multimemStore(val, dstElem + idx); - } } else { - // handle data in 16-byte unit - using Type16 = typename mscclpp::VectorType; - const size_t nType16 = size / sizeof(Type16); - const size_t srcOffset16 = srcOffset / sizeof(Type16); - const size_t dstOffset16 = dstOffset / sizeof(Type16); - Type16* src16 = reinterpret_cast(src) + srcOffset16; - Type16* dst16 = reinterpret_cast(dst) + dstOffset16; - for (size_t idx = threadIdx.x; idx < nType16; idx += blockDim.x) { - Type16 val = SwitchChannelDeviceHandle::multimemLoadReduce(src16 + idx); - SwitchChannelDeviceHandle::multimemStore(val, dst16 + idx); + static_assert(sizeof(T) <= 8, "Only support type with size <= 8 bytes"); + const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); + if (size <= 0) { + return; } - // handle rest of data - constexpr int RedBytes = (sizeof(T) == 8) ? 
8 : 4; - using TypeRest = typename mscclpp::VectorType; - const size_t processed = nType16 * sizeof(Type16); - const size_t nRest = (size - processed) / sizeof(TypeRest); - TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); - TypeRest* dstR = reinterpret_cast(dst + dstOffset + processed); - for (size_t idx = threadIdx.x; idx < nRest; idx += blockDim.x) { - TypeRest val = SwitchChannelDeviceHandle::multimemLoadReduce(srcR + idx); - SwitchChannelDeviceHandle::multimemStore(val, dstR + idx); + const uint32_t srcOffset = op.inputOffsets[0] + getOffset(op.nvlsInputBufferType, offset); + const uint32_t dstOffset = op.outputOffsets[0] + getOffset(op.nvlsOutputBufferType, offset); + assert(size % sizeof(T) == 0); + assert(srcOffset % sizeof(T) == 0); + assert(dstOffset % sizeof(T) == 0); + + T* src = (T*)nvlsChannels_[op.nvlsInputIndex].mcPtr; + T* dst = (T*)nvlsChannels_[op.nvlsOutputIndex].mcPtr; + if constexpr (std::is_same_v || std::is_same_v) { + const size_t nElem = size / sizeof(T); + const size_t srcOffsetElem = srcOffset / sizeof(T); + const size_t dstOffsetElem = dstOffset / sizeof(T); + VectorType* srcElem = reinterpret_cast*>(src + srcOffsetElem); + VectorType* dstElem = reinterpret_cast*>(dst + dstOffsetElem); + for (size_t idx = threadIdx.x; idx < nElem; idx += blockDim.x) { + auto val = SwitchChannelDeviceHandle::multimemLoadReduce(srcElem + idx); + SwitchChannelDeviceHandle::multimemStore(val, dstElem + idx); + } + } else { + // handle data in 16-byte unit + using Type16 = mscclpp::VectorType; + const size_t nType16 = size / sizeof(Type16); + const size_t srcOffset16 = srcOffset / sizeof(Type16); + const size_t dstOffset16 = dstOffset / sizeof(Type16); + Type16* src16 = reinterpret_cast(src) + srcOffset16; + Type16* dst16 = reinterpret_cast(dst) + dstOffset16; + for (size_t idx = threadIdx.x; idx < nType16; idx += blockDim.x) { + Type16 val = SwitchChannelDeviceHandle::multimemLoadReduce(src16 + idx); + SwitchChannelDeviceHandle::multimemStore(val, dst16 + idx); + } + // handle rest of data + constexpr int RedBytes = (sizeof(T) == 8) ? 
8 : 4; + using TypeRest = mscclpp::VectorType; + const size_t processed = nType16 * sizeof(Type16); + const size_t nRest = (size - processed) / sizeof(TypeRest); + TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); + TypeRest* dstR = reinterpret_cast(dst + dstOffset + processed); + for (size_t idx = threadIdx.x; idx < nRest; idx += blockDim.x) { + TypeRest val = SwitchChannelDeviceHandle::multimemLoadReduce(srcR + idx); + SwitchChannelDeviceHandle::multimemStore(val, dstR + idx); + } } } } @@ -1222,7 +876,7 @@ class ExecutionKernel { #endif break; #if defined(__FP8_TYPES_EXIST__) - case DataType::FP8_E4M3: + case DataType::FLOAT8_E4M3: executionKernel<__fp8_e4m3, PacketType, ReuseScratch><<>>( rank, (__fp8_e4m3*)src, (__fp8_e4m3*)dst, (__fp8_e4m3*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag @@ -1233,7 +887,7 @@ class ExecutionKernel { ); #endif break; - case DataType::FP8_E5M2: + case DataType::FLOAT8_E5M2: executionKernel<__fp8_e5m2, PacketType, ReuseScratch><<>>( rank, (__fp8_e5m2*)src, (__fp8_e5m2*)dst, (__fp8_e5m2*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag @@ -1245,6 +899,32 @@ class ExecutionKernel { #endif break; #endif // __FP8_TYPES_EXIST__ + case DataType::FLOAT8_E4M3B15: + executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<>>( + rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize, + plan, semaphores, localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; + case DataType::UINT8: + executionKernel<<>>( + rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, + localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; + case DataType::AUTO: + // AUTO is a sentinel that must be resolved before reaching this point. + assert(false && "DataType::AUTO must be resolved before kernel launch"); + break; } } #else // !defined(MSCCLPP_DEVICE_HIP) diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp new file mode 100644 index 00000000..e0c7f006 --- /dev/null +++ b/src/core/include/gdr.hpp @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_GDR_HPP_ +#define MSCCLPP_GDR_HPP_ + +#include +#include +#include + +namespace mscclpp { + +enum class GdrStatus { + Ok, // GDRCopy initialized successfully + NotBuilt, // Built without MSCCLPP_USE_GDRCOPY + Disabled, // Disabled via MSCCLPP_FORCE_DISABLE_GDR + DriverMissing, // /dev/gdrdrv not found + OpenFailed, // gdr_open() failed +}; + +/// Return the detailed status of the global GDRCopy context. +GdrStatus gdrStatus(); + +/// Whether the global GDRCopy context is enabled (shorthand for gdrStatus() == GdrStatus::Ok). +bool gdrEnabled(); + +/// Return a human-readable error message for the current GDRCopy status. +const char* gdrStatusMessage(); + +/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address. +/// When GDRCopy is not available, all operations are no-ops and valid() returns false. +class GdrMap { + public: + /// Pin and map a GPU address for direct host-side access. + /// @param gpuMem Shared pointer to the GPU memory (e.g. from gpuCallocShared). + /// @param deviceId The CUDA device ID for setting context. 
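To make the 16-byte main loop and the RedBytes tail in handleMultiLoadReduceStore concrete, here is a small worked example. The sizes are made up for illustration; the kernel derives them from T and the unit size:

```cpp
#include <cstddef>
#include <cstdio>

// Worked example of the split in handleMultiLoadReduceStore: the bulk of the range
// is reduced in 16-byte vectors, the remainder in 4-byte (8-byte for 64-bit T) vectors.
int main() {
  const size_t tSize = 2;                               // e.g. __half
  const size_t size = 72;                               // bytes in this unit (multiple of tSize)
  const size_t nType16 = size / 16;                     // 4 full 16-byte vectors
  const size_t processed = nType16 * 16;                // 64 bytes handled by the main loop
  const size_t redBytes = (tSize == 8) ? 8 : 4;         // tail vector width (RedBytes)
  const size_t nRest = (size - processed) / redBytes;   // 2 tail vectors of 2 halves each
  std::printf("main: %zu x 16B, tail: %zu x %zuB\n", nType16, nRest, redBytes);
  return 0;
}
```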
+ GdrMap(std::shared_ptr gpuMem, int deviceId); + ~GdrMap(); + + GdrMap(const GdrMap&) = delete; + GdrMap& operator=(const GdrMap&) = delete; + + /// Whether the mapping was established successfully. + bool valid() const; + + /// Return the BAR1-mapped host pointer to the GPU location. + uint64_t* hostPtr() const; + + /// Copy data from host memory to the mapped GPU location. + void copyTo(const void* src, size_t size); + + /// Copy data from the mapped GPU location to host memory. + void copyFrom(void* dst, size_t size) const; + + private: + struct Impl; + std::unique_ptr pimpl_; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_GDR_HPP_ diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 98fa47f2..f66545c2 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -44,9 +44,10 @@ struct GpuIpcMemHandle { struct { char handle[64]; + CUmemGenericAllocationHandle allocHandle; } fabric; - static void deleter(GpuIpcMemHandle *handle); + static void deleter(GpuIpcMemHandle* handle); // We make GpuIpcMemHandle trivially copyable for easy serialization, // and thus it cannot have explicit destructors. @@ -61,7 +62,7 @@ struct GpuIpcMemHandle { using Base::Base; // Allow implicit conversion from Base - UniquePtr(Base &&other) : Base(std::move(other)) {} + UniquePtr(Base&& other) : Base(std::move(other)) {} }; static UniquePtr create(const CUdeviceptr ptr); @@ -70,7 +71,7 @@ struct GpuIpcMemHandle { using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr; -std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags); +std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags); static_assert(std::is_trivially_copyable_v); @@ -82,7 +83,7 @@ class GpuIpcMem : public std::enable_shared_from_this { /// Create a GpuIpcMem instance from a GpuIpcMemHandle. /// @param handle The handle to import. /// @return A shared_ptr to the created GpuIpcMem instance. - static std::shared_ptr create(const GpuIpcMemHandle &handle); + static std::shared_ptr create(const GpuIpcMemHandle& handle); ~GpuIpcMem(); @@ -102,7 +103,7 @@ class GpuIpcMem : public std::enable_shared_from_this { std::shared_ptr mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize); private: - GpuIpcMem(const GpuIpcMemHandle &handle); + GpuIpcMem(const GpuIpcMemHandle& handle); GpuIpcMemHandle handle_; CUmemGenericAllocationHandle allocHandle_; diff --git a/src/core/include/gpu_utils_internal.hpp b/src/core/include/gpu_utils_internal.hpp new file mode 100644 index 00000000..a7cea86b --- /dev/null +++ b/src/core/include/gpu_utils_internal.hpp @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_GPU_UTILS_INTERNAL_HPP_ +#define MSCCLPP_GPU_UTILS_INTERNAL_HPP_ + +#include +#include + +#include "logger.hpp" + +namespace mscclpp { + +/// Check if a CUDA error indicates runtime teardown (safe to ignore in destructors). +inline bool isCudaTeardownError(cudaError_t err) { +#if defined(MSCCLPP_USE_ROCM) + return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; +#else // !defined(MSCCLPP_USE_ROCM) + return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || + err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; +#endif // !defined(MSCCLPP_USE_ROCM) +} + +/// Check if a CUDA driver error indicates runtime teardown. 
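A possible usage sketch for the GdrMap wrapper declared above, assuming the constructor takes a std::shared_ptr<uint64_t> (the template argument is stripped in the header text) and that the caller falls back to another path when GDRCopy is unavailable:

```cpp
#include <cstdint>
#include <memory>

#include "gdr.hpp"

// Hypothetical helper: publish a token to GPU memory from the host through BAR1.
// GdrMap degrades gracefully; valid() is false when GDRCopy cannot be used.
void publishToken(std::shared_ptr<uint64_t> token, int deviceId, uint64_t value) {
  mscclpp::GdrMap map(token, deviceId);
  if (!map.valid()) {
    // e.g. log mscclpp::gdrStatusMessage() and fall back to a cudaMemcpy-based path
    return;
  }
  map.copyTo(&value, sizeof(value));  // host-side write, visible to GPU pollers
}
```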
+inline bool isCuTeardownError(CUresult r) { + return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; +} + +} // namespace mscclpp + +/// Execute a CUDA runtime call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + cudaError_t __e = cmd; \ + if (mscclpp::isCudaTeardownError(__e)) { \ + (void)cudaGetLastError(); \ + } else { \ + MSCCLPP_CUDATHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + CUresult __e = cmd; \ + if (!mscclpp::isCuTeardownError(__e)) { \ + MSCCLPP_CUTHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and log (but don't throw) on error. +#define MSCCLPP_CUTHROW_IGNORE(cmd) \ + do { \ + CUresult __e = cmd; \ + if (__e != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(__e, &errStr); \ + WARN(GPU, __FILE__, ":", __LINE__, " Cuda failure ", static_cast(__e), " '", errStr, "'"); \ + } \ + } while (false) + +#endif // MSCCLPP_GPU_UTILS_INTERNAL_HPP_ diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp index c6436dbf..36c5a237 100644 --- a/src/core/include/ib.hpp +++ b/src/core/include/ib.hpp @@ -17,6 +17,7 @@ struct ibv_qp; struct ibv_cq; struct ibv_wc; struct ibv_send_wr; +struct ibv_recv_wr; struct ibv_sge; namespace mscclpp { @@ -28,14 +29,14 @@ struct IbMrInfo { class IbMr { public: - virtual ~IbMr(); + ~IbMr(); - virtual IbMrInfo getInfo() const; - virtual const void* getBuff() const; - virtual uint32_t getLkey() const; + IbMrInfo getInfo() const; + const void* getBuff() const; + uint32_t getLkey() const; private: - IbMr(ibv_pd* pd, void* buff, std::size_t size); + IbMr(ibv_pd* pd, void* buff, std::size_t size, bool isDataDirect); ibv_mr* mr_; void* buff_; @@ -52,7 +53,7 @@ struct IbQpInfo { uint64_t spn; int mtu; uint64_t iid; - bool is_grh; + bool isGrh; }; enum class WsStatus { @@ -61,38 +62,48 @@ enum class WsStatus { class IbQp { public: - virtual ~IbQp(); + ~IbQp(); - virtual void rtr([[maybe_unused]] const IbQpInfo& info); - virtual void rts(); - virtual void stageSend([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled); - virtual void stageAtomicAdd([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint64_t wrId, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] uint64_t addVal, [[maybe_unused]] bool signaled); - virtual void stageSendWithImm([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled, [[maybe_unused]] unsigned int immData); - virtual void postSend(); - virtual int pollCq(); + void rtr(const IbQpInfo& info); + void rts(); + void stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled); + void stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled); + void stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, 
uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData); + void postSend(); + + void stageRecv(uint64_t wrId); + void stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset = 0); + void postRecv(); + + int pollSendCq(); + int pollRecvCq(); IbQpInfo& getInfo() { return info_; } - virtual int getWcStatus([[maybe_unused]] int idx) const; - virtual std::string getWcStatusString([[maybe_unused]] int idx) const; - virtual int getNumCqItems() const; + int getSendWcStatus(int idx) const; + std::string getSendWcStatusString(int idx) const; + int getNumSendCqItems() const; + int getRecvWcStatus(int idx) const; + std::string getRecvWcStatusString(int idx) const; + unsigned int getRecvWcImmData(int idx) const; private: - struct WrInfo { + struct SendWrInfo { ibv_send_wr* wr; ibv_sge* sge; }; - IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend); - WrInfo getNewWrInfo(); + struct RecvWrInfo { + ibv_recv_wr* wr; + ibv_sge* sge; + }; + + IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic); + SendWrInfo getNewSendWrInfo(); + RecvWrInfo getNewRecvWrInfo(); int portNum_; int gidIndex_; @@ -100,16 +111,24 @@ class IbQp { IbQpInfo info_; ibv_qp* qp_; - ibv_cq* cq_; - std::shared_ptr> wcs_; - std::shared_ptr> wrs_; - std::shared_ptr> sges_; - int wrn_; - int numSignaledPostedItems_; - int numSignaledStagedItems_; + ibv_cq* sendCq_; + ibv_cq* recvCq_; + std::shared_ptr> sendWcs_; + std::shared_ptr> recvWcs_; + std::shared_ptr> sendWrs_; + std::shared_ptr> sendSges_; + std::shared_ptr> recvWrs_; + std::shared_ptr> recvSges_; + int numStagedSend_; + int numStagedRecv_; + int numPostedSignaledSend_; + int numStagedSignaledSend_; - const int maxCqPollNum_; + const int maxSendCqPollNum_; + const int maxSendWr_; const int maxWrPerSend_; + const int maxRecvWr_; + const bool noAtomic_; friend class IbCtx; }; @@ -120,17 +139,25 @@ class IbCtx { IbCtx(const std::string& devName); ~IbCtx(); - std::shared_ptr createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, int maxRecvWr, - int maxWrPerSend); + std::shared_ptr createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend, bool noAtomic); std::unique_ptr registerMr(void* buff, std::size_t size); + bool supportsRdmaAtomics() const; + bool isMlx5() const; + bool isDataDirect() const; + bool isVirtualFunction() const; #else IbCtx([[maybe_unused]] const std::string& devName) {} ~IbCtx() {} - std::shared_ptr createQp(int, int, int, int, int, int, int) { return nullptr; } + std::shared_ptr createQp(int, int, int, int, int, int, int, bool) { return nullptr; } std::unique_ptr registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) { return nullptr; } + bool supportsRdmaAtomics() const { return false; } + bool isMlx5() const { return false; } + bool isDataDirect() const { return false; } + bool isVirtualFunction() const { return false; } #endif const std::string& getDevName() const { return devName_; }; @@ -142,6 +169,10 @@ class IbCtx { const std::string devName_; ibv_context* ctx_; ibv_pd* pd_; + bool supportsRdmaAtomics_; + bool isMlx5_; + bool isDataDirect_; + bool isVF_; }; } // namespace mscclpp diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index 
45054ff3..5b0da8ba 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -12,12 +12,12 @@ namespace mscclpp { struct IBVerbs { private: - static void *dlsym(const std::string &symbol, bool allowReturnNull = false); + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); public: #define REGISTER_IBV_FUNC_WITH_NAME(name__, func__) \ template \ - static inline auto(name__)(Args && ...args) { \ + static inline auto(name__)(Args && ... args) { \ static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \ static decltype(&::func__) impl = nullptr; \ if (!impl) impl = reinterpret_cast(IBVerbs::dlsym(#func__)); \ @@ -46,7 +46,7 @@ struct IBVerbs { REGISTER_IBV_FUNC(ibv_wc_status_str) static bool isDmabufSupported(); - static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int); + static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int); /// /// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`. @@ -57,8 +57,8 @@ struct IBVerbs { #else // defined(ibv_get_device_list) #undef ibv_get_device_list REGISTER_IBV_FUNC(ibv_static_providers) - static inline struct ibv_device **ibv_get_device_list(int *num_devices) { - using FuncType = struct ibv_device **(*)(int *); + static inline struct ibv_device** ibv_get_device_list(int* num_devices) { + using FuncType = struct ibv_device** (*)(int*); static FuncType impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_get_device_list")); IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL); @@ -67,21 +67,21 @@ struct IBVerbs { #endif // defined(ibv_get_device_list) #undef ibv_query_port - static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { static decltype(&::ibv_query_port) impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_query_port")); - struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + struct verbs_context* vctx = verbs_get_ctx_op(context, query_port); if (!vctx) { int rc; ::memset(port_attr, 0, sizeof(*port_attr)); - rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr); + rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr); return rc; } return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); } #undef ibv_reg_mr - static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { + static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) { static decltype(&::ibv_reg_mr) impl = nullptr; static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr; int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0); @@ -98,11 +98,15 @@ struct IBVerbs { /// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`. 
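The REGISTER_IBV_FUNC_WITH_NAME machinery above memoizes each symbol lookup in a function-local static, so dlsym runs once per wrapper. A distilled, self-contained version of that pattern, using libm as a stand-in library (names here are illustrative, not from the diff):

```cpp
#include <dlfcn.h>

#include <cstdio>

// Open the library once and reuse the handle.
static void* mathLib() {
  static void* h = ::dlopen("libm.so.6", RTLD_NOW);  // stand-in for libibverbs
  return h;
}

// Each wrapper owns its own static cache of the resolved symbol.
static double callCos(double x) {
  using Fn = double (*)(double);
  static Fn impl = nullptr;  // resolved on first call, reused afterwards
  if (!impl) impl = reinterpret_cast<Fn>(::dlsym(mathLib(), "cos"));
  return impl ? impl(x) : 0.0;
}

int main() {
  std::printf("cos(0) = %f\n", callCos(0.0));
  return 0;
}
```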
/// - static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } - static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) { + static inline int ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struct ibv_recv_wr** bad_wr) { + return qp->context->ops.post_recv(qp, wr, bad_wr); + } + + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } }; diff --git a/src/core/include/mlx5dv_wrapper.hpp b/src/core/include/mlx5dv_wrapper.hpp new file mode 100644 index 00000000..79403a36 --- /dev/null +++ b/src/core/include/mlx5dv_wrapper.hpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_MLX5DV_WRAPPER_HPP_ +#define MSCCLPP_MLX5DV_WRAPPER_HPP_ + +#if defined(MSCCLPP_USE_MLX5DV) + +#include + +#include + +namespace mscclpp { + +struct MLX5DV { + /// Whether libmlx5.so was successfully loaded at runtime. + static bool isAvailable(); + + /// Check if the given IB device supports mlx5 Direct Verbs. + static bool mlx5dv_is_supported(struct ibv_device* device); + + /// Register a DMABUF memory region using mlx5dv extensions. + /// Returns nullptr if mlx5dv_reg_dmabuf_mr is not available in this rdma-core version. + static struct ibv_mr* mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd, + int access); + + /// Query the Data Direct sysfs path for the given IB context. + /// Returns 0 on success (device supports Data Direct), non-zero otherwise. + static int mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len); + + private: + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); +}; + +} // namespace mscclpp + +#endif // defined(MSCCLPP_USE_MLX5DV) +#endif // MSCCLPP_MLX5DV_WRAPPER_HPP_ diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp new file mode 100644 index 00000000..463f827d --- /dev/null +++ b/src/core/include/reduce_kernel.hpp @@ -0,0 +1,195 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_REDUCE_KERNEL_HPP_ +#define MSCCLPP_REDUCE_KERNEL_HPP_ + +#include +#include +#include + +namespace mscclpp { + +#if defined(MSCCLPP_DEVICE_COMPILE) + +// Generic element-wise calculation helper +template +MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) { + if constexpr (OpType == SUM) { + return a + b; + } else if constexpr (OpType == MIN) { + return mscclpp::min(a, b); + } + static_assert(OpType == SUM || OpType == MIN, "Unsupported ReduceOp"); +} + +// Generic vector reduction helpers + +template +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + uint2 ret; + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available). 
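The calElements/calVectorHelper helpers above all follow the same shape: reinterpret a raw 32-bit lane as the element type, reduce, and reinterpret back. A host-side analogy with a float packed in uint32_t (illustrative only; the device code uses bit_cast and packed types such as f16x2):

```cpp
#include <cstdint>
#include <cstring>

// memcpy-based bit-cast stand-in (std::bit_cast requires C++20).
template <typename To, typename From>
To bitCast(const From& src) {
  static_assert(sizeof(To) == sizeof(From), "size mismatch for bitCast");
  To dst;
  std::memcpy(&dst, &src, sizeof(To));
  return dst;
}

// Same shape as a SUM reduction over a single 32-bit lane holding a float.
uint32_t sumLane(uint32_t a, uint32_t b) {
  return bitCast<uint32_t>(bitCast<float>(a) + bitCast<float>(b));
}
```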
+template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = fa + fb; + return bit_cast(fr); +} + +template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = mscclpp::min(fa, fb); + return bit_cast(fr); +} + +template +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(calElements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(calElements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic. +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template +MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback). +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fa + fb); +} + +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fminf(fa, fb)); +} + +// calVector wrapper – converts scalar types to vector types and calls calVectorHelper +template +MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) { + // Define the vectorized computation type based on the element type + static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); + static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); + using CompType = typename std::conditional_t< + std::is_same_v, f32x2, + std::conditional_t< + std::is_same_v, f16x2, + std::conditional_t< + std::is_same_v, bf16x2, + std::conditional_t< + std::is_same_v, u8x4, + std::conditional_t, f8_e4m3b15x4, +#if defined(__FP8_TYPES_EXIST__) + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, T>> +#else + T +#endif + >>>>>; + return calVectorHelper(a, b); +} + +/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements). +/// Uses the optimized to<>() specializations when available (e.g. 
FP8 -> float hardware intrinsics). +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + ToVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements). +/// Uses the optimized to<>() specializations when available. +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + FromVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType). +/// When AccumT == T, falls back to the standard calVector. +/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator. +template +MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) { + if constexpr (std::is_same_v) { + return calVector(acc, val); + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + + ToVec fv = mscclpp::to(reinterpret_cast(val)); + const ToVec& fa = reinterpret_cast(acc); + ToVec fr; +#pragma unroll + for (int i = 0; i < nElems; ++i) { + fr.data[i] = calElements(fa.data[i], fv.data[i]); + } + return reinterpret_cast(fr); + } +} + +#endif // defined(MSCCLPP_DEVICE_COMPILE) + +} // namespace mscclpp + +#endif // MSCCLPP_REDUCE_KERNEL_HPP_ diff --git a/src/core/mlx5dv_wrapper.cc b/src/core/mlx5dv_wrapper.cc new file mode 100644 index 00000000..a56fad96 --- /dev/null +++ b/src/core/mlx5dv_wrapper.cc @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
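The upcast/reduce/downcast split that upcastVector, downcastVector, and calVectorAccum implement can be summarized with a scalar analogy (illustrative only; the real code operates on packed vectors with FP8/half element types):

```cpp
// Scalar analogy of calVectorAccum: widen each low-precision input to the
// accumulator type, reduce there, and downcast once at the end. Accumulating in
// the wider type avoids compounding rounding error across the reduction.
float accumulate(const unsigned char* lowPrecision, int n) {
  float acc = 0.0f;  // AccumT
  for (int i = 0; i < n; ++i) {
    acc += static_cast<float>(lowPrecision[i]);  // upcast, then reduce in AccumT
  }
  return acc;  // a caller needing T again would downcast exactly once here
}
```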
+ +#if defined(MSCCLPP_USE_MLX5DV) + +// _GNU_SOURCE is required for dlvsym() +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "mlx5dv_wrapper.hpp" + +#include +#include + +#ifndef MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT +#define MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT (1 << 0) +#endif + +#include + +#include "logger.hpp" + +namespace mscclpp { + +static std::unique_ptr globalMLX5Handle(nullptr, &::dlclose); + +void* MLX5DV::dlsym(const std::string& symbol, bool allowReturnNull) { + if (!globalMLX5Handle) { + const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr}; + for (int i = 0; possibleLibNames[i] != nullptr; i++) { + void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW); + if (handle) { + globalMLX5Handle.reset(handle); + break; + } + } + if (!globalMLX5Handle) { + if (allowReturnNull) return nullptr; + THROW(NET, SysError, errno, "Failed to open libmlx5: ", std::string(::dlerror())); + } + } + void* ptr = ::dlsym(globalMLX5Handle.get(), symbol.c_str()); + if (!ptr && !allowReturnNull) { + THROW(NET, SysError, errno, "Failed to load libmlx5 symbol: ", symbol); + } + return ptr; +} + +bool MLX5DV::isAvailable() { + static int available = -1; + if (available == -1) { + // Try to load the library; if it fails, mlx5dv is not available + const char* possibleLibNames[] = {"libmlx5.so", "libmlx5.so.1", nullptr}; + for (int i = 0; possibleLibNames[i] != nullptr; i++) { + void* handle = ::dlopen(possibleLibNames[i], RTLD_NOW); + if (handle) { + if (!globalMLX5Handle) { + globalMLX5Handle.reset(handle); + } else { + ::dlclose(handle); + } + available = 1; + INFO(NET, "libmlx5 loaded successfully"); + return true; + } + } + available = 0; + DEBUG(NET, "libmlx5 not available"); + } + return available == 1; +} + +bool MLX5DV::mlx5dv_is_supported(struct ibv_device* device) { + using FuncType = bool (*)(struct ibv_device*); + static FuncType impl = nullptr; + if (!impl) { + void* ptr = MLX5DV::dlsym("mlx5dv_is_supported", /*allowReturnNull=*/true); + if (!ptr) return false; + impl = reinterpret_cast(ptr); + } + return impl(device); +} + +struct ibv_mr* MLX5DV::mlx5dv_reg_dmabuf_mr(struct ibv_pd* pd, uint64_t offset, size_t length, uint64_t iova, int fd, + int access) { + // mlx5dv_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access) — the last arg is mlx5-specific flags. + // Must use dlvsym with "MLX5_1.25" version to get the Data Direct-capable symbol. + using FuncType = struct ibv_mr* (*)(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int, int); + static FuncType impl = nullptr; + static bool resolved = false; + if (!resolved) { + if (globalMLX5Handle) { + void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_reg_dmabuf_mr", "MLX5_1.25"); + if (!ptr) { + ptr = MLX5DV::dlsym("mlx5dv_reg_dmabuf_mr", /*allowReturnNull=*/true); + } + impl = ptr ? reinterpret_cast(ptr) : nullptr; + } + resolved = true; + } + if (!impl) return nullptr; + return impl(pd, offset, length, iova, fd, access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT); +} + +int MLX5DV::mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) { + using FuncType = int (*)(struct ibv_context*, char*, size_t); + static FuncType impl = nullptr; + static bool resolved = false; + if (!resolved) { + if (globalMLX5Handle) { + void* ptr = dlvsym(globalMLX5Handle.get(), "mlx5dv_get_data_direct_sysfs_path", "MLX5_1.25"); + if (!ptr) { + ptr = MLX5DV::dlsym("mlx5dv_get_data_direct_sysfs_path", /*allowReturnNull=*/true); + } + impl = ptr ? 
reinterpret_cast(ptr) : nullptr; + } + resolved = true; + } + if (!impl) return -1; + return impl(context, buf, buf_len); +} + +} // namespace mscclpp + +#endif // defined(MSCCLPP_USE_MLX5DV) diff --git a/src/core/npkit/npkit.cc b/src/core/npkit/npkit.cc index 30fc35c7..84457abf 100644 --- a/src/core/npkit/npkit.cc +++ b/src/core/npkit/npkit.cc @@ -103,10 +103,10 @@ static int GetGpuClockRateInKhz() { else return 25000; #else - cudaDeviceProp dev_prop; + int clockRate; MSCCLPP_CUDATHROW(cudaGetDevice(&dev_id)); - MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&dev_prop, dev_id)); - return dev_prop.clockRate; + MSCCLPP_CUDATHROW(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev_id)); + return clockRate; #endif } #endif diff --git a/src/core/proxy.cc b/src/core/proxy.cc index 2a980505..de5b90fc 100644 --- a/src/core/proxy.cc +++ b/src/core/proxy.cc @@ -59,11 +59,15 @@ MSCCLPP_API_CPP Proxy::~Proxy() { MSCCLPP_API_CPP void Proxy::start(bool blocking) { pimpl_->running.store(true, std::memory_order_release); pimpl_->service = std::thread([this] { + // threadInit() is responsible for setting up the runtime context for the thread. + // The default implementation sets the CUDA device and NUMA affinity to match the main thread (see Proxy ctor). + // It should be called before any CUDA API calls to avoid resource allocation on unwanted GPUs. + pimpl_->threadInit(); + // never capture in a proxy thread auto mode = cudaStreamCaptureModeRelaxed; MSCCLPP_CUDATHROW(cudaThreadExchangeStreamCaptureMode(&mode)); - pimpl_->threadInit(); pimpl_->threadStarted.store(true, std::memory_order_release); ProxyHandler handler = this->pimpl_->handler; diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index 57ac5979..49a3791b 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -8,7 +8,7 @@ #include "atomic.hpp" #include "connection.hpp" #include "context.hpp" -#include "debug.h" +#include "logger.hpp" #include "registered_memory.hpp" #include "serialization.hpp" @@ -49,12 +49,12 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection token_ = std::make_shared(0); } else if (localDevice.type == DeviceType::GPU) { if (localDevice.id < 0) { - throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "Local GPU ID is not provided"); } CudaDeviceGuard deviceGuard(localDevice.id); token_ = gpuCallocToken(connection_.context()); } else { - throw Error("Unsupported local device type", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "Unsupported local device type"); } idMemory_ = std::move(connection_.context()->registerMemory(token_.get(), sizeof(uint64_t), connection_.transport())); } @@ -79,7 +79,7 @@ MSCCLPP_API_CPP SemaphoreStub SemaphoreStub::deserialize(const std::vector RegisteredMemory idMemory(std::make_shared(data.begin(), memEnd)); auto it = detail::deserialize(memEnd, device); if (it != data.end()) { - throw Error("SemaphoreStub deserialize failed", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "SemaphoreStub deserialize failed"); } return SemaphoreStub(std::make_shared(std::move(idMemory), device)); } @@ -120,13 +120,35 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema expectedInboundToken_(detail::gpuCallocUnique()), outboundToken_(std::make_unique()) { if (connection().localDevice().type != DeviceType::GPU) { - throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); 
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2DeviceSemaphore should be GPU");
   }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding (HostNoAtomic): the receiver's recv thread polls the recv CQ for
+    // WRITE_WITH_IMM completions, then forwards the token to inboundToken_ via GDRCopy.
+    CudaDeviceGuard deviceGuard(connection().localDevice().id);
+#if defined(MSCCLPP_USE_ROCM)
+    inboundToken_ = detail::gpuCallocUncachedShared<uint64_t>();
+#else
+    inboundToken_ = detail::gpuCallocShared<uint64_t>();
+#endif
+    connImpl->startSignalForwarding(inboundToken_);
+  }
+  // When isSignalForwarding() is false (atomic mode), inboundToken_ stays null
+  // and the GPU polls the SemaphoreStub token directly (the NIC atomic target).
 }
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection)
     : Host2DeviceSemaphore(buildSemaphoreFromConnection(communicator, connection)) {}
 
+MSCCLPP_API_CPP Host2DeviceSemaphore::~Host2DeviceSemaphore() {
+  if (inboundToken_) {
+    // Clear the connection's signal forwarding destination (and GdrMap)
+    // before inboundToken_ is freed, to avoid use-after-free on the pinned GPU memory.
+    BaseConnection::getImpl(connection())->stopSignalForwarding();
+  }
+}
+
 MSCCLPP_API_CPP Connection& Host2DeviceSemaphore::connection() { return semaphore_.connection(); }
 
 MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
@@ -135,7 +157,11 @@ MSCCLPP_API_CPP void Host2DeviceSemaphore::signal() {
 
 MSCCLPP_API_CPP Host2DeviceSemaphore::DeviceHandle Host2DeviceSemaphore::deviceHandle() const {
   Host2DeviceSemaphore::DeviceHandle device;
-  device.inboundToken = reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
+  // If inboundToken_ is allocated (signal forwarding mode), the GPU polls it.
+  // Otherwise (atomic mode), the GPU polls the SemaphoreStub token directly,
+  // which is the same address targeted by the NIC's atomic operation.
+  device.inboundToken =
+      inboundToken_ ? inboundToken_.get() : reinterpret_cast<uint64_t*>(semaphore_.localMemory().data());
   device.expectedInboundToken = expectedInboundToken_.get();
   return device;
 }
@@ -145,10 +171,18 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor
       expectedInboundToken_(std::make_unique<uint64_t>()),
       outboundToken_(std::make_unique<uint64_t>()) {
   if (connection().transport() == Transport::CudaIpc) {
-    throw Error("Host2HostSemaphore cannot be used with CudaIpc transport", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Host2HostSemaphore cannot be used with CudaIpc transport");
   }
   if (connection().localDevice().type != DeviceType::CPU) {
-    throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Local endpoint device type of Host2HostSemaphore should be CPU");
+  }
+  auto connImpl = BaseConnection::getImpl(connection());
+  if (connImpl->isSignalForwarding()) {
+    // Signal forwarding mode: tell the recv thread where to write the incoming token.
+    // Non-owning shared_ptr: Host2HostSemaphore outlives the connection, so the memory stays valid.
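Both semaphore flavors converge on the same poll loop: deviceHandle() above resolves signal-forwarding and atomic modes to a single inboundToken pointer, and the waiter simply spins on it. A hedged device-side sketch of that loop, assuming mscclpp's atomicLoad/memoryOrderAcquire device helpers:

```cpp
#include <mscclpp/atomic_device.hpp>

// Sketch only: the poller does not care whether the token was stored by the CPU
// recv thread (signal forwarding) or by the NIC's RDMA atomic; it spins on
// whichever address deviceHandle() selected as inboundToken.
__device__ void waitToken(uint64_t* inboundToken, uint64_t* expectedInboundToken) {
  uint64_t expected = ++(*expectedInboundToken);
  while (mscclpp::atomicLoad(inboundToken, mscclpp::memoryOrderAcquire) < expected) {
    // spin until the host recv thread or the NIC publishes the token
  }
}
```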
+ auto token = + std::shared_ptr(reinterpret_cast(semaphore_.localMemory().data()), [](uint64_t*) {}); + connImpl->startSignalForwarding(std::move(token)); } } @@ -174,17 +208,16 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { while (atomicLoad(reinterpret_cast(semaphore_.localMemory().data()), memoryOrderAcquire) < (*expectedInboundToken_)) { if (maxSpinCount >= 0 && spinCount++ == maxSpinCount) { - throw Error("Host2HostSemaphore::wait timed out", ErrorCode::Timeout); + THROW(CONN, Error, ErrorCode::Timeout, "Host2HostSemaphore::wait timed out"); } } } MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore) - : semaphore_(semaphore), - expectedInboundToken_(detail::gpuCallocUnique()), - outboundToken_(detail::gpuCallocUnique()) { + : semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique()) { if (connection().localDevice().type != DeviceType::GPU) { - throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, + "Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU"); } } @@ -199,7 +232,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe device.remoteInboundToken = reinterpret_cast(semaphore_.remoteMemory().data()); device.inboundToken = reinterpret_cast(semaphore_.localMemory().data()); device.expectedInboundToken = expectedInboundToken_.get(); - device.outboundToken = outboundToken_.get(); return device; }; diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 2e7b2920..7ba97a3c 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -8,12 +8,15 @@ #include "allgather/allgather_fullmesh_2.hpp" #include "allreduce/allreduce_allpair_packet.hpp" #include "allreduce/allreduce_fullmesh.hpp" -#include "allreduce/allreduce_nvls.hpp" +#include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/allreduce_nvls_packet.hpp" -#include "allreduce/allreduce_nvls_with_copy.hpp" -#include "allreduce/allreduce_nvls_with_copy_2.hpp" +#include "allreduce/allreduce_nvls_warp_pipeline.hpp" +#include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/allreduce_packet.hpp" #include "alltoallv/alltoallv_fullmesh.hpp" +#include "allreduce/allreduce_rsag.hpp" +#include "allreduce/allreduce_rsag_pipeline.hpp" +#include "allreduce/allreduce_rsag_zero_copy.hpp" #include "logger.hpp" namespace mscclpp { @@ -50,8 +53,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() { void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize, int rank) { - auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize); + size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank) { + auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize); auto dslCollection = buildDefaultDslAlgorithms(rank); nativeCollection.extend(dslCollection); nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_); @@ -59,24 +63,39 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t } AlgorithmCollection 
AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize) { + size_t scratchBufferSize, + uintptr_t flagBuffer, + size_t flagBufferSize) { AlgorithmCollection collection; - auto allreduceAllpairPkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceAllpairPkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt); - auto allreduceNvlsPacket = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsPacket = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket); - auto allreduceNvlsWithCopy = std::make_shared(scratchBuffer, scratchBufferSize)->build(); - collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(), - allreduceNvlsWithCopy); - auto allreduceNvlsWithCopy2 = std::make_shared(scratchBuffer, scratchBufferSize)->build(); - collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(), - allreduceNvlsWithCopy2); - auto allreducePkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsWarpPipeline = + std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceNvlsWarpPipeline->collective(), allreduceNvlsWarpPipeline->name(), + allreduceNvlsWarpPipeline); + auto allreduceNvlsBlockPipeline = + std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceNvlsBlockPipeline->collective(), allreduceNvlsBlockPipeline->name(), + allreduceNvlsBlockPipeline); + auto allreducePkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt); auto allreduceNvls = std::make_shared()->build(); collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls); auto allreduceFullmesh = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceFullmesh->collective(), allreduceFullmesh->name(), allreduceFullmesh); + auto allreduceRsag = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceRsag->collective(), allreduceRsag->name(), allreduceRsag); + auto allreduceRsagPipeline = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + collection.registerAlgorithm(allreduceRsagPipeline->collective(), allreduceRsagPipeline->name(), + allreduceRsagPipeline); + auto allreduceRsagZeroCopy = std::make_shared()->build(); + collection.registerAlgorithm(allreduceRsagZeroCopy->collective(), allreduceRsagZeroCopy->name(), + allreduceRsagZeroCopy); auto allgatherFullmesh = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allgatherFullmesh->collective(), allgatherFullmesh->name(), allgatherFullmesh); @@ -110,13 +129,13 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra return oss.str(); }; - std::string planDir = env()->executionPlanDir; + auto planDir = std::filesystem::path(env()->cacheDir) / "default"; if (!std::filesystem::exists(planDir)) { - INFO(ALGO, "Plan directory does not exist: ", planDir); + INFO(ALGO, 
"Default plan directory does not exist: ", planDir); return collection; } for (const auto& config : defaultAlgoConfigs) { - std::string planPath = planDir + "/" + config.filename; + auto planPath = planDir / config.filename; INFO(ALGO, "Loading plan: ", planPath); if (!std::filesystem::exists(planPath)) { INFO(ALGO, "Plan file does not exist: ", planPath); diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 34f8d4e7..fb51a342 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -170,7 +170,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr AllgatherFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> CommResult { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 84f14ca2..9d169d68 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1) } } -AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) { - if (mscclpp::env()->disableChannelCache) { - disableChannelCache_ = true; - } -} - void AllgatherFullmesh2::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); @@ -174,7 +168,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptrbootstrap()->getNranks(); recvBasePtr = (CUdeviceptr)output; @@ -197,10 +191,11 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr AllgatherFullmesh2::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> mscclpp::CommResult { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { 
return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index a4881093..17bcfc33 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #include "allreduce/allreduce_allpair_packet.hpp" #include "allreduce/common.hpp" @@ -11,29 +12,18 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, - int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) { + int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, + uint32_t flagSize) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); const int nPeers = nRanksPerNode - 1; - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? (scratchBufferSize / numScratchBuff) : 0; size_t channelScratchOffset = scratchBaseOffset; @@ -54,30 +44,23 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand // step 2: Reduce Data for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nelems; idx += blockDim.x * gridDim.x) { uint32_t data = src[idx]; + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(data); for (int index = 0; index < nPeers; index++) { const int remoteRank = index < rank ? 
index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } - dst[idx] = data; + dst[idx] = mscclpp::downcastVector(acc); } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } } @@ -88,24 +71,23 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int return {(worldSize - 1) * 4, 512}; } -template +template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, + cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - if (nBlocks == 7 || nBlocks == 28) { - allreduceAllPairs<<>>( - (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); - return cudaGetLastError(); + // Round nBlocks to multiple of nPeers so every block maps to a valid peer. 
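+    // e.g. with worldSize == 8 (7 peers), a request for 32 blocks rounds down to 28;
+    // requests below nPeers would round to 0, but allreduceKernelFunc rejects those first.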
+ const int nPeers = worldSize - 1; + if (nPeers > 0) { + nBlocks = (nBlocks / nPeers) * nPeers; } - allreduceAllPairs<<>>( + allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); + nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -116,44 +98,38 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); - std::vector flags(28, 1); - flags7_ = detail::gpuCallocShared(7); - flags28_ = detail::gpuCallocShared(28); - gpuMemcpy(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice); - gpuMemcpy(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice); } CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize, algoCtx->workSize); } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 7) { - flags = this->flags7_.get(); - } else if (blockAndThreadNum.first == 28) { - flags = this->flags28_.get(); + // nBlocks must be at least nPeers for allpair — each block maps to one peer. 
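+  // This guard also ensures the adapter's round-down of nBlocks to a multiple of
+  // nPeers can never produce a zero-block launch.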
+ const int nPeers = algoCtx->nRanksPerNode - 1; + if (nPeers > 0 && blockAndThreadNum.first < nPeers) { + return CommResult::CommInvalidArgument; } - size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); return CommResult::CommInvalidArgument; } - cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, - nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags, - this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + cudaError_t error = + allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, + algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, + this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -185,7 +161,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p return ctx; } -AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) { size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -193,21 +169,23 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* } std::shared_ptr AllreduceAllpairPacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_allpair_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, 
inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index e8cd93bb..24d2a31c 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,7 +9,7 @@ namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, @@ -26,6 +26,10 @@ __global__ void __launch_bounds__(512, 1) int4* scratch4 = reinterpret_cast((char*)scratch); int4* resultBuff4 = reinterpret_cast(resultBuff); + // AccumVec: wider vector for mixed-precision accumulation. When AccumT==T, this is just int4 (no-op). + constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` constexpr size_t unitNInt4 = 512; const size_t maxNInt4PerBlock = @@ -81,12 +85,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -121,12 +127,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -144,17 +152,18 @@ __global__ void __launch_bounds__(512, 1) } } -template +template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, - size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, + int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; - allreduceFullmesh<<>>( + allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); return cudaGetLastError(); @@ -173,15 +182,18 @@ void AllreduceFullmesh::initialize(std::shared_ptr comm) { localScratchMemory_ = std::move(localMemory); } -CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, - int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceFullmesh::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; - MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - size_t channelOutOffset = (char*)output - (char*)recvBasePtr; + size_t channelOutOffset = 0; + if (symmetricMemory_) { + MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); + channelOutOffset = (char*)output - (char*)recvBasePtr; + } std::shared_ptr> inputChannelHandles; if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { inputChannelHandles = this->memoryChannelsMap_[input].second; @@ -194,17 +206,24 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct } inputChannelHandles = this->memoryChannelsMap_[input].second; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", static_cast(op), static_cast(dtype)); return CommResult::CommInvalidArgument; } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + if (numBlocksAndThreads.first > 64) { + WARN("AllreduceFullmesh: number of blocks exceeds maximum supported blocks, which is 64"); + return mscclpp::CommResult::CommInvalidArgument; + } + if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { + numBlocksAndThreads = {35, 512}; + } cudaError_t error = 
allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -212,19 +231,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) { +AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType, + bool symmetricMemory) { static int tag = 0; size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - if (env()->disableChannelCache) { + symmetricMemory_ = symmetricMemory; + if (!symmetricMemory_) { return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++}; } return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0}; } std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr comm, const void*, - void* output, size_t, DataType) { + void* output, size_t size, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); @@ -236,6 +257,10 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrregisterMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc); ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory); ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, @@ -251,15 +276,17 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu similarity index 72% rename from src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu rename to src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index 
2a109c6f..2d71cd63 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls_with_copy_2.hpp" +#include "allreduce/allreduce_nvls_block_pipeline.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -15,11 +15,12 @@ __device__ DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES]; template __global__ void __launch_bounds__(1024, 1) - allreduceNvlsWithCopy2([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, [[maybe_unused]] void* dst, - [[maybe_unused]] DeviceHandle* memoryChannels, - [[maybe_unused]] DeviceHandle* switchChannels, [[maybe_unused]] size_t size, - [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + allreduceNvlsBlockPipeline([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, + [[maybe_unused]] void* dst, + [[maybe_unused]] DeviceHandle* memoryChannels, + [[maybe_unused]] DeviceHandle* switchChannels, + [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize, + [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; int nPeers = nRanksPerNode - 1; @@ -145,28 +146,35 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template -struct NvlsWithCopy2Adapter { +template +struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { -#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS - if constexpr (std::is_same_v || std::is_same_v) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
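+      // cudaErrorNotSupported propagates out of the adapter; allreduceKernelFunc then
+      // reports it to the caller as CommUnhandledCudaError.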
return cudaErrorNotSupported; } else +#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS + if constexpr (std::is_same_v || std::is_same_v) { + return cudaErrorNotSupported; + } else #endif - { - using ChannelType = DeviceHandle; - allreduceNvlsWithCopy2 - <<>>(input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, - inputSize, scratchBufferSize, rank, nRanksPerNode); - return cudaGetLastError(); - } + { + using ChannelType = DeviceHandle; + allreduceNvlsBlockPipeline + <<>>(input, scratch, output, (ChannelType*)memoryChannels, + nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + return cudaGetLastError(); + } } }; -void AllreduceNvlsWithCopy2::initialize(std::shared_ptr comm) { +void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; int nBaseChannels = 64; this->conns_ = setupConnections(comm); @@ -176,14 +184,16 @@ void AllreduceNvlsWithCopy2::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, + void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -194,52 +204,53 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); + WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsBlockPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } -std::shared_ptr AllreduceNvlsWithCopy2::initAllreduceContext(std::shared_ptr comm, const void*, - void*, size_t, DataType) { +std::shared_ptr AllreduceNvlsBlockPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = 
comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } -std::shared_ptr AllreduceNvlsWithCopy2::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); +std::shared_ptr AllreduceNvlsBlockPipeline::build() { + auto self = + std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); return std::make_shared( - "default_allreduce_nvls_with_copy2", "allreduce", + "default_allreduce_nvls_block_pipeline", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index aafe7566..a616485e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -1,33 +1,25 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#include + #include "allreduce/allreduce_nvls_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) { + [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags, + [[maybe_unused]] uint32_t flagBufferSize) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % 2) ? scratchBufferSize / 2 : 0; uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T)); @@ -41,31 +33,24 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i); } for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) { - uint data = src[i]; + // When T == AccumT, stay with raw uint to avoid type mismatch in identity path. + using AccRaw = + std::conditional_t, uint, mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(src[i]); for (int peer = 0; peer < worldSize; peer++) { - if (peer == rank) { - continue; - } + if (peer == rank) continue; uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vectors(data, val); + acc = mscclpp::calVectorAccum(acc, val); } - dst[i] = data; + dst[i] = mscclpp::downcastVector(acc); } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + // update other flags in-case using different number of blocks in next launch + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #endif } @@ -80,35 +65,27 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize) { return {blockNum, threadNum}; } -template +template struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, - void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) { - if (nBlocks == 4 || nBlocks == 8) { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } else { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, 
nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } + void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { + allreduceNvlsPacket<<>>( + (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, + worldSize, flags, flagBufferSize); return cudaGetLastError(); } }; -void AllreduceNvlsPacket::initialize(std::shared_ptr) { - std::vector flags(8, 1); - flags_ = detail::gpuCallocShared(16); - flags4_ = detail::gpuCallocShared(4); - flags8_ = detail::gpuCallocShared(8); - gpuMemcpy(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice); - gpuMemcpy(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice); +void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { + int nSwitchChannels = 1; + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); + this->switchChannels_ = + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } -AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } @@ -120,10 +97,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - int nSwitchChannels = 1; - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); - ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + ctx->switchChannels = this->switchChannels_; ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } @@ -131,54 +105,53 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + mscclpp::DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize); } if (blockAndThreadNum.first > maxBlockNum_) { - WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_); + WARN(ALGO, "Block number ", blockAndThreadNum.first, " exceeds the maximum limit ", maxBlockNum_); return CommResult::CommInvalidArgument; } - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 4) { - flags = this->flags4_.get(); - } else if (blockAndThreadNum.first == 8) { - flags = this->flags8_.get(); - } cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, 
ctx->workSize, inputSize, stream, flags, - 0, blockAndThreadNum.first, blockAndThreadNum.second); + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } std::shared_ptr AllreduceNvlsPacket::build() { - auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_, + flagBufferSize_); return std::make_shared( "default_allreduce_nvls_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu similarity index 71% rename from src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu rename to src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index 113fdb7c..3bb054da 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls_with_copy.hpp" +#include "allreduce/allreduce_nvls_warp_pipeline.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -13,11 +13,12 @@ namespace collective { template __global__ void __launch_bounds__(1024, 1) - allreduce10([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, [[maybe_unused]] void* dst, - [[maybe_unused]] DeviceHandle* memoryChannels, - [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, - [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, - [[maybe_unused]] int nRanksPerNode) { + allreduceNvlsWarpPipeline([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, + [[maybe_unused]] void* dst, + [[maybe_unused]] DeviceHandle* memoryChannels, + [[maybe_unused]] DeviceHandle* multicast, [[maybe_unused]] size_t size, + [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank, + [[maybe_unused]] int nRanksPerNode) { 
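+  // The body below compiles only for SM90 and newer; on older architectures the kernel
+  // is empty, which is why every parameter is marked [[maybe_unused]].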
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 constexpr int alignment = 16; int nPeers = nRanksPerNode - 1; @@ -108,28 +109,35 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template -struct NvlsWithCopyAdapter { +template +struct NvlsWarpPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { -#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS - if constexpr (std::is_same_v || std::is_same_v) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. return cudaErrorNotSupported; } else +#if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS + if constexpr (std::is_same_v || std::is_same_v) { + return cudaErrorNotSupported; + } else #endif - { - using ChannelType = DeviceHandle; - allreduce10<<>>(input, scratch, output, (ChannelType*)memoryChannels, - nvlsChannels, inputSize, scratchBufferSize, rank, - nRanksPerNode); - return cudaGetLastError(); - } + { + using ChannelType = DeviceHandle; + allreduceNvlsWarpPipeline + <<>>(input, scratch, output, (ChannelType*)memoryChannels, + nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode); + return cudaGetLastError(); + } } }; -void AllreduceNvlsWithCopy::initialize(std::shared_ptr comm) { +void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { nSwitchChannels_ = 8; int nBaseChannels = 64; this->conns_ = setupConnections(comm); @@ -139,14 +147,15 @@ void AllreduceNvlsWithCopy::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -157,51 +166,52 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, 
ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); + WARN("AllreduceNvlsWarpPipeline failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceNvlsWithCopy::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsWarpPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } -std::shared_ptr AllreduceNvlsWithCopy::initAllreduceContext(std::shared_ptr comm, const void*, - void*, size_t, DataType) { +std::shared_ptr AllreduceNvlsWarpPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); ctx->switchChannels = - setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_); ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } -std::shared_ptr AllreduceNvlsWithCopy::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); +std::shared_ptr AllreduceNvlsWarpPipeline::build() { + auto self = + std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); return std::make_shared( - "default_allreduce_nvls_with_copy", "allreduce", + "default_allreduce_nvls_warp_pipeline", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu similarity index 69% rename from src/ext/collectives/allreduce/allreduce_nvls.cu rename to 
src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index 98f884f8..e7f2028f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -3,7 +3,7 @@ #include -#include "allreduce/allreduce_nvls.hpp" +#include "allreduce/allreduce_nvls_zero_copy.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" #include "debug.h" @@ -11,6 +11,8 @@ namespace mscclpp { namespace collective { +constexpr int MAX_NBLOCKS = 32; + template __global__ void __launch_bounds__(1024, 1) allreduceNvls([[maybe_unused]] mscclpp::DeviceHandle* memoryChannels, @@ -23,9 +25,18 @@ __global__ void __launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; - size_t sizePerBlock = sizePerRank / nBlocks; + const size_t minAlign = 16; + // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore + size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; + sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign; + size_t rankOffset = sizePerRank * rank; size_t blockOffset = sizePerBlock * bid + rankOffset; + size_t curBlockSize = 0; + if (sizePerBlock * bid < sizePerRank) { + curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid); + } + mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; @@ -44,8 +55,10 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock, - threadIdx.x, blockDim.x); + if (curBlockSize > 0) { + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, + threadIdx.x, blockDim.x); + } __syncthreads(); if (threadIdx.x < nPeers) { channels[threadIdx.x].relaxedSignal(); @@ -54,15 +67,22 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) + if constexpr (std::is_same_v) { + return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
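+      // The hardware FP8 types (e4m3/e5m2) are handled by the arch-gated branch below,
+      // which rejects them unless building arch-specifically for SM100-class targets.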
+ return cudaErrorNotSupported; + } else #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) - if constexpr (std::is_same_v || std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; } else #endif @@ -77,7 +97,12 @@ struct NvlsAdapter { }; void AllreduceNvls::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; + int device; + MSCCLPP_CUDATHROW(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); + computeCapabilityMajor_ = deviceProp.major; + nSwitchChannels_ = 32; this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -85,14 +110,21 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { // setup base memory channels this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nSwitchChannels_); this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_); + this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); + this->nvlsOutConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + [[maybe_unused]] const std::unordered_map& extras, + mscclpp::DataType accumDtype) { + if (!symmetricMemory_) { + WARN("AllreduceNvls requires symmetric memory for now."); + return CommResult::CommInvalidArgument; + } auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -110,12 +142,22 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {ctx->nRanksPerNode, 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerNode, MAX_NBLOCKS), 1024}; + // For GB200 devices with MNNVLS (Multi-Node NVLink Sharp), scale the number of blocks inversely with + // the number of GPUs. Empirically, 32 blocks works well for 4 GPUs and 16 for 8 GPUs, which + // follows the formula 128 / nGPUs, clamped to [1, MAX_NBLOCKS]. 
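+    // e.g. 4 GPUs -> 32 blocks, 8 GPUs -> 16, 16 GPUs -> 8; very large worldSize
+    // values clamp to 1 so at least one block is always launched.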
+ if (computeCapabilityMajor_ == 10) { + numBlocksAndThreads.first = ::max(1, ::min(128 / ctx->workSize, MAX_NBLOCKS)); + } + } + if (numBlocksAndThreads.first > MAX_NBLOCKS) { + WARN("Number of blocks exceeds maximum supported value of %d", MAX_NBLOCKS); + return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, - inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -124,7 +166,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType) { + mscclpp::DataType, bool symmetricMemory) { + symmetricMemory_ = symmetricMemory; size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -145,13 +188,11 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptrnvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); - ctx->switchChannels = setupNvlsChannels(ctx->nvlsConnections, (void*)sendBasePtr, sendBytes, nSwitchChannels_); + ctx->switchChannels = setupNvlsChannels(this->nvlsConnections_, (void*)sendBasePtr, sendBytes, nSwitchChannels_); if (input != output) { - auto nvlsOutConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); + auto nvlsOutConnections = this->nvlsOutConnections_; std::vector outChannels = - setupNvlsChannels(nvlsOutConnections, (void*)recvBasePtr, recvBytes, nSwitchChannels_); - ctx->nvlsConnections.insert(ctx->nvlsConnections.end(), nvlsOutConnections.begin(), nvlsOutConnections.end()); + setupNvlsChannels(this->nvlsOutConnections_, (void*)recvBasePtr, recvBytes, nSwitchChannels_); ctx->switchChannels.insert(ctx->switchChannels.end(), outChannels.begin(), outChannels.end()); } @@ -162,19 +203,22 @@ std::shared_ptr AllreduceNvls::initAllreduceContext(std::shared_ptr AllreduceNvls::build() { auto self = std::make_shared(); return std::make_shared( - "default_allreduce_nvls", "allreduce", + "default_allreduce_nvls_zero_copy", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + 
mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d150c717..e2d8ef73 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -2,22 +2,21 @@ // Licensed under the MIT License. #include +#include #include "allreduce/allreduce_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, - size_t nelems, void* flags, uint32_t numScratchBuff + size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { @@ -60,11 +59,7 @@ __global__ void __launch_bounds__(1024, 1) const int nPeers = nRanksPerNode - 1; const size_t nPkts = nelems / 2; - uint32_t flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t channelScratchOffset = (flag % numScratchBuff) ? scratchBufferSize / numScratchBuff : 0; int nelemsPerRank = nelems / worldSize; @@ -98,12 +93,21 @@ __global__ void __launch_bounds__(1024, 1) // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = src[idx]; - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data.x = cal_vectors(val.x, data.x); - data.y = cal_vectors(val.y, data.y); + { + // When T == AccumT, stay with raw uint32_t to avoid type mismatch in identity path. + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw accX = mscclpp::upcastVector(data.x); + AccRaw accY = mscclpp::upcastVector(data.y); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + accX = mscclpp::calVectorAccum(accX, val.x); + accY = mscclpp::calVectorAccum(accY, val.y); + } + data.x = mscclpp::downcastVector(accX); + data.y = mscclpp::downcastVector(accY); } dst[idx].x = data.x; @@ -129,15 +133,12 @@ __global__ void __launch_bounds__(1024, 1) result[idx].y = data.y; } - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \ defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) @@ -151,25 +152,27 @@ #endif } -template +template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, - int nThreadsPerBlock = 0) { + cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, + int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); + // Round the number of blocks down to a multiple of (worldSize - 1) + nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(), + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff); + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } @@ -193,18 +196,22 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int } } -#if defined(__FP8_TYPES_EXIST__) // FP8-specific tuning for 32KB-256KB range - if (dtype == DataType::FP8_E4M3 || dtype == DataType::FP8_E5M2) { - if (inputSize < (64 << 10)) { - nThreadsPerBlock = 64; - } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { - nThreadsPerBlock = 128; - } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { - nThreadsPerBlock = 256; + { + bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; +#if defined(__FP8_TYPES_EXIST__) + isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2; +#endif + if (isFp8) { + if (inputSize < (64 << 10)) { + nThreadsPerBlock = 64; + } else if (inputSize >= (64 << 10) && inputSize <= 
(128 << 10)) { + nThreadsPerBlock = 128; + } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { + nThreadsPerBlock = 256; + } } } -#endif #endif return {nBlocks, nThreadsPerBlock}; } @@ -215,13 +222,13 @@ void AllreducePacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); } CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ -233,18 +240,19 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - void* flags = this->flags_.get(); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, channelInOffset, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreducePacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -274,7 +282,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptr AllreducePacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return 
self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu new file mode 100644 index 00000000..db471b93 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "allreduce/allreduce_rsag.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { + +// Allreduce using the Reduce-Scatter + All-Gather (RSAG) pattern. +// +// This algorithm performs allreduce in three phases over intra-node peers +// connected via CudaIpc memory channels: +// +// 1. Scatter: Each rank copies its input data into a scratch buffer, then +// signals peers and waits for all peers to do the same. +// +// 2. Reduce-Scatter: Each rank reduces its assigned chunk by reading the +// corresponding chunks from all peers' scratch buffers (via remote memory +// handles) and applying the reduction op. The reduced result is written +// back to both the local result buffer and peers' scratch buffers. +// +// 3. All-Gather: After a second signal/wait barrier, each rank copies the +// reduced chunks produced by other ranks from the scratch buffer into its +// result buffer, completing the allreduce. +// +// Data is processed in int4-sized (16-byte) units for coalesced memory access, +// with special handling for any remainder elements at the tail. 
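Editor's aside: the int4 chunking described above can be sanity-checked on the host with the same arithmetic the allreduceRsAg kernel uses. This sketch is not part of the change and the sample sizes are hypothetical; it only mirrors the kernel's formulas.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical example: 8 ranks, 1003 half-precision elements (sizeof(T) == 2).
  const uint32_t nRanksPerNode = 8, sizeofT = 2, nelems = 1003;
  const uint32_t nelemsPerInt4 = 16 / sizeofT;  // sizeof(int4) == 16 bytes -> 8 elements per int4
  // Round the per-rank share up to whole int4 units, then scale back to a total.
  const uint32_t alignedNelems =
      ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 *
      nelemsPerInt4 * nRanksPerNode;                               // 1024
  const uint32_t nelemsPerRank = alignedNelems / nRanksPerNode;    // 128
  const uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4;     // 16
  const uint32_t lastInt4Index = nelems / nelemsPerInt4;           // 125: the int4 holding the tail
  const uint32_t remainder = nelems % nelemsPerInt4;               // 3 tail elements copied one by one
  std::printf("perRank=%u int4PerRank=%u lastIdx=%u rem=%u\n",
              nelemsPerRank, nInt4PerRank, lastInt4Index, remainder);
  return 0;
}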
+template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAg(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int nRanksPerNode, + int worldSize, size_t nelems) { + int blockId = blockIdx.x; + uint32_t nPeers = nRanksPerNode - 1; + + assert((uintptr_t)buff % sizeof(int4) == 0); + assert((uintptr_t)resultBuff % sizeof(int4) == 0); + + constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); + uint32_t alignedNelems = ((nelems + nRanksPerNode - 1) / nRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * nRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / nRanksPerNode; + uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; + uint32_t lastInt4Index = nelems / nelemsPerInt4; + uint32_t remainder = nelems % nelemsPerInt4; + + int4* scratch4 = reinterpret_cast((char*)scratch); + int4* resultBuff4 = reinterpret_cast((char*)resultBuff); + int4* buff4 = reinterpret_cast((char*)buff); + DeviceHandle* memoryChannelsLocal = memoryChannels + blockId * nPeers; + + uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x; + uint32_t remainderForBlock = nInt4PerRank % gridDim.x; + uint32_t offset4 = blockId * nInt4PerBlock; + if (blockId == (int)(gridDim.x - 1)) { + nInt4PerBlock += remainderForBlock; + } + if (nInt4PerBlock == 0) return; + uint32_t nInt4ForCopy = nInt4PerBlock * nRanksPerNode; + + for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { + int rankIdx = idx / nInt4PerBlock; + uint32_t offsetIdx = rankIdx * nInt4PerRank + offset4 + (idx % nInt4PerBlock); + if (offsetIdx > lastInt4Index) continue; + if (offsetIdx == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&scratch4[offsetIdx])[i] = ((T*)&buff4[offsetIdx])[i]; + } + continue; + } + scratch4[offsetIdx] = buff4[offsetIdx]; + } + __syncthreads(); + if (threadIdx.x < nPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } + __syncthreads(); + for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { + uint32_t offset = idx + offset4 + rank * nInt4PerRank; + if (offset > lastInt4Index) continue; + int4 tmp = scratch4[offset]; + for (uint32_t i = 0; i < nPeers; i++) { + int rankIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + tmp = calVector(data, tmp); + } + for (uint32_t i = 0; i < nPeers; i++) { + int rankIdx = (rank + i + 1) % nRanksPerNode; + int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; + mscclpp::write(((void**)remoteMemories)[peerIdx], offset, tmp); + } + if (offset == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&resultBuff4[offset])[i] = ((T*)&tmp)[i]; + } + continue; + } + resultBuff4[offset] = tmp; + } + __syncthreads(); + if (threadIdx.x < nPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } + __syncthreads(); + for (uint32_t idx = threadIdx.x; idx < nInt4ForCopy; idx += blockDim.x) { + int rankIdx = idx / nInt4PerBlock; + if (rankIdx == rank) continue; + uint32_t offsetIdx = rankIdx * nInt4PerRank + offset4 + (idx % nInt4PerBlock); + if (offsetIdx > lastInt4Index) continue; + if (offsetIdx == lastInt4Index && remainder != 0) { + for (uint32_t i = 0; i < remainder; i++) { + ((T*)&resultBuff4[offsetIdx])[i] = ((T*)&scratch4[offsetIdx])[i]; + } + continue; + } + resultBuff4[offsetIdx] = scratch4[offsetIdx]; + } +} + +template +struct AllreduceRsAgAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = 64; + } + allreduceRsAg<<>>( + (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, + nRanksPerNode, worldSize, nelems); + return cudaGetLastError(); + } +}; + +void AllreduceRsAg::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = 64; + comm_ = comm; + // setup semaphores + this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + RegisteredMemory localMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); + this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), localMemory); + localScratchMemory_ = std::move(localMemory); + + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->scratchSemaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : this->remoteScratchMemories_) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + this->remoteMemoryHandles_ = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(this->remoteMemoryHandles_.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); +} + +CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, + size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, + int nBlocks, int nThreadsPerBlock, + const std::unordered_map&, DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + if (inputSize > this->scratchBufferSize_) { + WARN(ALGO, "Input size ", inputSize, " exceeds scratch buffer size ", this->scratchBufferSize_); + return 
CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), + this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, 0, algoCtx->rank, + algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, 0, + numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAg::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { + return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; +} + +std::shared_ptr AllreduceRsAg::initAllreduceContext(std::shared_ptr comm, const void*, void*, + size_t, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->scratchSemaphores_; + ctx->registeredMemories = this->remoteScratchMemories_; + return ctx; +} + +std::shared_ptr AllreduceRsAg::build() { + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + return std::make_shared( + "default_allreduce_rsag", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu new file mode 100644 index 00000000..eabe3dc5 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -0,0 +1,337 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include "allreduce/allreduce_rsag_pipeline.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { +constexpr int MAX_NBLOCKS_FOR_PUT = 32; +constexpr int MAX_NBLOCKS_FOR_RECV = 32; +constexpr int MAX_NBLOCKS_FOR_REDUCE = 64; +constexpr int REDUCE_COPY_RATIO = 2; +__device__ DeviceSemaphore semaphoreForSend[MAX_NBLOCKS_FOR_REDUCE]; +__device__ DeviceSemaphore semaphoreForRecv[MAX_NBLOCKS_FOR_REDUCE]; +__device__ DeviceSemaphore semaphoreForReduce[MAX_NBLOCKS_FOR_REDUCE]; + +// TODO: move it to a common header file +template +__device__ __forceinline__ int4 loadVec(const T* buff, size_t i, size_t nelems) { + constexpr size_t ElemsPerInt4 = sizeof(int4) / sizeof(T); + size_t offset = i * ElemsPerInt4; + if (offset + ElemsPerInt4 <= nelems) { + return reinterpret_cast(buff)[i]; + } else { + union { + int4 i; + T t[ElemsPerInt4]; + } vec; + vec.i = make_int4(0, 0, 0, 0); + for (size_t j = 0; j < ElemsPerInt4 && offset + j < nelems; ++j) { + vec.t[j] = buff[offset + j]; + } + return vec.i; + } +} + +template +__device__ __forceinline__ void storeVec(T* buff, size_t i, int4 val, size_t nelems) { + constexpr size_t ElemsPerInt4 = sizeof(int4) / sizeof(T); + size_t offset = i * ElemsPerInt4; + if (offset + ElemsPerInt4 <= nelems) { + reinterpret_cast(buff)[i] = val; + } else { + union { + int4 i; + T t[ElemsPerInt4]; + } vec; + vec.i = val; + for (size_t j = 0; j < ElemsPerInt4 && offset + j < nelems; ++j) { + buff[offset + j] = vec.t[j]; + } + } +} + +// Pipelined Reduce-Scatter + All-Gather (RSAG) allreduce. +// +// This is a pipelined variant of the basic RSAG allreduce that overlaps +// communication and computation by splitting the data into chunks processed +// across multiple iterations. Three groups of thread blocks run concurrently +// with different roles, synchronized via device semaphores: +// +// PUT blocks — Read local input chunks and write them into peers' scratch +// buffers via remote memory handles (CudaIpc). +// +// REDUCE blocks — After a signal/wait barrier confirming PUT completion, +// reduce the local chunk with data received from all peers +// in the scratch buffer. Write the reduced result to both +// the local output and peers' scratch (for the AG phase). +// +// RECV blocks — After a signal/wait barrier confirming REDUCE completion, +// copy other ranks' reduced chunks from scratch into the +// local result buffer, completing the all-gather. +// +// Pipelining is achieved by using a circular scratch buffer (pipelineDepth +// stages). PUT blocks wait on a semaphore before reusing a scratch slot, +// allowing the next iteration's PUT to overlap with the current iteration's +// REDUCE and RECV. Each REDUCE block handles a subset of the PUT block's +// data (controlled by REDUCE_COPY_RATIO), enabling finer-grained overlap. +// +// Data is processed in int4-sized (16-byte) units with vectorized load/store +// helpers that handle tail elements. 
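Editor's aside: the PUT/REDUCE/RECV handshake described above is a credit-based pipeline. A minimal host-side analogue (C++20; not part of the change; std::counting_semaphore stands in for the DeviceSemaphore arrays, and the REDUCE_COPY_RATIO fan-out is simplified to a 1:1:1 chain) shows how pipelineDepth credits let PUT run at most that many iterations ahead of RECV:

#include <cstdio>
#include <semaphore>
#include <thread>

int main() {
  constexpr int pipelineDepth = 2, nIters = 6;
  // send starts with pipelineDepth credits, mirroring semaphoreForSend[bid].set(pipelineDepth).
  std::counting_semaphore<pipelineDepth> send(pipelineDepth);
  std::counting_semaphore<nIters> reduce(0), recv(0);

  std::thread put([&] {
    for (int i = 0; i < nIters; ++i) {
      send.acquire();    // wait for a free scratch slot
      std::printf("PUT    iter %d -> slot %d\n", i, i % pipelineDepth);
      reduce.release();  // slot filled; REDUCE may start
    }
  });
  std::thread red([&] {
    for (int i = 0; i < nIters; ++i) {
      reduce.acquire();  // wait for PUT of iter i
      std::printf("REDUCE iter %d\n", i);
      recv.release();    // reduced data ready; RECV may copy it out
    }
  });
  std::thread rcv([&] {
    for (int i = 0; i < nIters; ++i) {
      recv.acquire();    // wait for REDUCE of iter i
      std::printf("RECV   iter %d\n", i);
      send.release();    // slot drained; PUT may reuse it
    }
  });
  put.join(); red.join(); rcv.join();
  return 0;
}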
+ +template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAgPipeline(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, + int nRanksPerNode, int worldSize, size_t nelems, size_t scratchSize, uint32_t nblocksForPut, + uint32_t nblocksForReduce, uint32_t nblocksForRecv) { + uint32_t bid = blockIdx.x; + constexpr uint32_t nStepsPerIter = 4; + uint32_t nInt4 = (nelems * sizeof(T) + sizeof(int4) - 1) / sizeof(int4); + uint32_t nInt4PerIter = nblocksForReduce * blockDim.x * nStepsPerIter; + const uint32_t chunkSize = nInt4PerIter * worldSize; + uint32_t nIters = (nInt4 + chunkSize - 1) / chunkSize; + uint32_t nPeers = nRanksPerNode - 1; + int4* scratch4 = reinterpret_cast((char*)scratch); + const uint32_t scratchIterStride = 2 * chunkSize; // one slot for the reduce-scatter data, one for the all-gather data + const uint32_t pipelineDepth = scratchSize / sizeof(int4) / scratchIterStride; + assert(pipelineDepth >= 1); + + if (bid < nblocksForPut) { + if (threadIdx.x == 0) { + semaphoreForSend[bid].set(pipelineDepth); + } + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x == 0) { + semaphoreForSend[bid].acquire(); + } + __syncthreads(); + uint32_t threadIdInPut = bid * blockDim.x + threadIdx.x; + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + int peerId = remoteRankId < rank ? remoteRankId : remoteRankId - 1; + // Read chunk[remoteRankId] from local buff, write to peer's scratch[rank] (sender's slot) + uint32_t srcOffset = iter * chunkSize + remoteRankId * nInt4PerIter; + uint32_t dstOffset = (iter % pipelineDepth) * scratchIterStride + rank * nInt4PerIter; + int4 tmp[nStepsPerIter * REDUCE_COPY_RATIO]; +#pragma unroll + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = srcOffset + threadIdInPut + step * blockDim.x * nblocksForPut; + tmp[step] = loadVec(buff, offset, nelems); + } +#pragma unroll + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = dstOffset + threadIdInPut + step * blockDim.x * nblocksForPut; + mscclpp::write(((void**)remoteMemories)[peerId], offset, tmp[step]); + } + } + __syncthreads(); + if (threadIdx.x < REDUCE_COPY_RATIO) { + semaphoreForReduce[bid * REDUCE_COPY_RATIO + threadIdx.x].release(); + } + } + } else if (bid < nblocksForPut + nblocksForReduce) { + uint32_t bidInReduce = bid - nblocksForPut; + DeviceHandle* localMemoryChannels = memoryChannels + bidInReduce * nPeers; + // Map REDUCE blocks to PUT blocks: REDUCE blocks 0,1 handle PUT block 0's data + uint32_t putBlockId = bidInReduce / REDUCE_COPY_RATIO; + uint32_t subBlockId = bidInReduce % REDUCE_COPY_RATIO; + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x == 0) { + semaphoreForReduce[bidInReduce].acquire(); + } + uint32_t baseOffset = (iter % pipelineDepth) * scratchIterStride; + uint32_t baseSrcOffset = iter * chunkSize; + + // Use same thread mapping as PUT: putBlockId * blockDim.x + threadIdx.x + uint32_t threadIdInPut = putBlockId * blockDim.x + threadIdx.x; + __syncthreads(); + if (threadIdx.x < nPeers) { + localMemoryChannels[threadIdx.x].signal(); + localMemoryChannels[threadIdx.x].wait(); + } + __syncthreads(); +#pragma unroll nStepsPerIter + for (uint32_t step = 0; step < nStepsPerIter; step++) { + // Map to PUT's step pattern: each REDUCE block handles nStepsPerIter steps + // subBlockId determines which subset of the REDUCE_COPY_RATIO * nStepsPerIter steps this block handles + uint32_t 
putStep = subBlockId * nStepsPerIter + step; + uint32_t myChunkOffset = + baseSrcOffset + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + int4 tmp = loadVec(buff, myChunkOffset, nelems); + // Add data from each peer's slot in scratch (peer sent their chunk[rank] to our scratch[peer]) + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + uint32_t peerSlotOffset = + baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + int4 data = scratch4[peerSlotOffset]; + tmp = calVector(data, tmp); + } + storeVec(resultBuff, myChunkOffset, tmp, nelems); + // Broadcast reduced result to all peers' scratch at SCATTER_AG_OFFSET + rank * nInt4PerIter + uint32_t dstOffset = + baseOffset + chunkSize + rank * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; + for (uint32_t i = 0; i < nPeers; i++) { + int peerIdx = (rank + i + 1) % nRanksPerNode; + int index = peerIdx < rank ? peerIdx : peerIdx - 1; + mscclpp::write(((void**)remoteMemories)[index], dstOffset, tmp); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + semaphoreForRecv[bidInReduce].release(); + } + } + } else if (bid < nblocksForPut + nblocksForReduce + nblocksForRecv) { + uint32_t bidInRecv = bid - nblocksForPut - nblocksForReduce; + DeviceHandle* localMemoryChannels = memoryChannels + (nblocksForReduce + bidInRecv) * nPeers; + for (uint32_t iter = 0; iter < nIters; iter++) { + if (threadIdx.x < REDUCE_COPY_RATIO) { + semaphoreForRecv[bidInRecv * REDUCE_COPY_RATIO + threadIdx.x].acquire(); + } + uint32_t baseOffset = scratchIterStride * (iter % pipelineDepth); + uint32_t baseDstOffset = chunkSize * iter; + int threadIdInRecv = bidInRecv * blockDim.x + threadIdx.x; + __syncthreads(); + if (threadIdx.x < nPeers) { + localMemoryChannels[threadIdx.x].signal(); + localMemoryChannels[threadIdx.x].wait(); + } + __syncthreads(); + // Copy other ranks' reduced chunks from scratch to result + for (uint32_t peer = 0; peer < nPeers; peer++) { + int remoteRankId = (rank + peer + 1) % nRanksPerNode; + for (uint32_t step = 0; step < nStepsPerIter * REDUCE_COPY_RATIO; step++) { + uint32_t offset = baseOffset + chunkSize + remoteRankId * nInt4PerIter + threadIdInRecv + + step * blockDim.x * nblocksForRecv; + uint32_t dstOffset = + baseDstOffset + remoteRankId * nInt4PerIter + threadIdInRecv + step * blockDim.x * nblocksForRecv; + storeVec(resultBuff, dstOffset, scratch4[offset], nelems); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + semaphoreForSend[bidInRecv].release(); + } + } + } +} + +template +struct AllreduceRsAgPipelineAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t scratchSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + uint32_t nblocksForPut = MAX_NBLOCKS_FOR_PUT; + uint32_t nblocksForReduce = MAX_NBLOCKS_FOR_REDUCE; + uint32_t nblocksForRecv = MAX_NBLOCKS_FOR_RECV; + int maxNblocks = nblocksForPut + nblocksForReduce + nblocksForRecv; + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = maxNblocks; + } else { + nBlocks = nBlocks / (REDUCE_COPY_RATIO + 2) * (REDUCE_COPY_RATIO + 2); + if (nBlocks > maxNblocks) { + WARN(ALGO, "The number 
of blocks is too large for the allreduce pipeline algorithm, reducing it to ", + maxNblocks); + nBlocks = maxNblocks; + } + nblocksForPut = nBlocks / (REDUCE_COPY_RATIO + 2); + nblocksForReduce = nblocksForPut * REDUCE_COPY_RATIO; + nblocksForRecv = nblocksForPut; + } + allreduceRsAgPipeline<<>>( + (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, + nRanksPerNode, worldSize, nelems, scratchSize, nblocksForPut, nblocksForReduce, nblocksForRecv); + return cudaGetLastError(); + } +}; + +void AllreduceRsAgPipeline::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = MAX_NBLOCKS_FOR_REDUCE + MAX_NBLOCKS_FOR_RECV; + comm_ = comm; + // setup semaphores + this->scratchSemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + RegisteredMemory localMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); + this->remoteScratchMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), localMemory); + localScratchMemory_ = std::move(localMemory); + + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->scratchSemaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : this->remoteScratchMemories_) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + this->remoteMemoryHandles_ = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(this->remoteMemoryHandles_.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); +} + +CommResult AllreduceRsAgPipeline::allreduceKernelFunc( + const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = allreduce(input, this->scratchBuffer_, output, this->baseMemoryChannelHandles_.get(), + this->remoteMemoryHandles_.get(), nullptr, nullptr, 0, 0, this->scratchBufferSize_, + algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, nullptr, 0, + 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAgPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { + return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; +} + +std::shared_ptr AllreduceRsAgPipeline::initAllreduceContext(std::shared_ptr comm, const void*, + void*, size_t, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->scratchSemaphores_; + ctx->registeredMemories = this->remoteScratchMemories_; + return ctx; +} + +std::shared_ptr 
AllreduceRsAgPipeline::build() { + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + return std::make_shared( + "default_allreduce_rsag_pipeline", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu new file mode 100644 index 00000000..f95ba7e3 --- /dev/null +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include + +#include "allreduce/allreduce_rsag_zero_copy.hpp" +#include "allreduce/common.hpp" +#include "collective_utils.hpp" +#include "logger.hpp" + +namespace mscclpp { +namespace collective { + +__device__ mscclpp::DeviceSyncer globalSyncer; + +// Zero-copy Reduce-Scatter + All-Gather (RSAG) allreduce. +// +// Unlike the standard RSAG which copies input into a scratch buffer first, +// this variant reads directly from peers' input buffers and writes reduced +// results directly to peers' output buffers — eliminating the need for a +// separate scratch buffer and reducing memory traffic. +// +// The algorithm runs in a single kernel with the following steps: +// +// 1. Barrier: Signal and wait on all peers to ensure input buffers are ready. +// +// 2. Reduce-Scatter: Each rank reads its assigned chunk from every peer's +// input buffer (via CudaIpc remote memory handles), reduces all values +// locally, then writes the reduced result to its own output buffer AND +// directly to every peer's output buffer at the same offset. +// +// 3. Global sync + Barrier: A device-wide sync ensures all writes complete, +// followed by a final signal/wait to guarantee all peers have finished +// writing, making the full output buffer valid on every rank. +// +// This approach requires registering both input and output buffers as remote +// memories (2 * nPeers handles), but avoids scratch buffer allocation and +// the extra copy steps of the standard RSAG. The NRanksPerNode template +// parameter enables compile-time unrolling of peer loops (supports 4 or 8). 
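Editor's aside: the dense peer indexing and the input/output handle layout used by the zero-copy kernel below are easy to misread, so here is a host-side sketch (illustrative only, not part of the change) of the mapping for a hypothetical rank 2 of 4:

#include <cstdio>

int main() {
  const int nRanksPerNode = 4, rank = 2, nPeers = nRanksPerNode - 1;
  // In allreduceRsAgZeroCopy, remoteMemories[0..nPeers) hold peers' input buffers and
  // remoteMemories[nPeers..2*nPeers) their output buffers (outputRemoteBufferOffset == nPeers).
  for (int i = 0; i < nPeers; ++i) {
    int remoteRankId = (rank + i + 1) % nRanksPerNode;  // visit peers round-robin, starting after self
    int peerIdx = remoteRankId < rank ? remoteRankId : remoteRankId - 1;  // dense index, local rank skipped
    std::printf("peer %d: rank %d -> input handle %d, output handle %d\n",
                i, remoteRankId, peerIdx, nPeers + peerIdx);
  }
  return 0;
}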
+ +template +__global__ void __launch_bounds__(1024, 1) + allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, + DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, + size_t nelems) { + int blockId = blockIdx.x; + + assert((uintptr_t)buff % sizeof(int4) == 0); + assert((uintptr_t)resultBuff % sizeof(int4) == 0); + + constexpr int NPeers = NRanksPerNode - 1; + constexpr uint32_t nelemsPerInt4 = sizeof(int4) / sizeof(T); + const uint32_t outputRemoteBufferOffset = NRanksPerNode - 1; + uint32_t alignedNelems = ((nelems + NRanksPerNode - 1) / NRanksPerNode + nelemsPerInt4 - 1) / nelemsPerInt4 * + nelemsPerInt4 * NRanksPerNode; + uint32_t nelemsPerRank = alignedNelems / NRanksPerNode; + uint32_t nInt4PerRank = nelemsPerRank / nelemsPerInt4; + uint32_t nInt4Total = (nelems + nelemsPerInt4 - 1) / nelemsPerInt4; + + int4* resultBuff4 = reinterpret_cast((char*)resultBuff); + int4* buff4 = reinterpret_cast((char*)buff); + DeviceHandle* memoryChannelsLocal = memoryChannels + blockId * NPeers; + + uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x; + uint32_t remainderForBlock = nInt4PerRank % gridDim.x; + uint32_t offset4 = blockId * nInt4PerBlock; + if (blockId == (int)(gridDim.x - 1)) { + nInt4PerBlock += remainderForBlock; + } + if (nInt4PerBlock == 0) return; + + if (threadIdx.x < NPeers) { + memoryChannelsLocal[threadIdx.x].relaxedSignal(); + memoryChannelsLocal[threadIdx.x].relaxedWait(); + } + __syncthreads(); + int4 data[NPeers]; + // AccumVec: when AccumT != T, use a wider accumulator type. + // For AccumT == T, this is just int4 (no-op conversion). + constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + // When T == AccumT, stay with raw int4 to avoid type mismatch in identity path. + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { + uint32_t offset = idx + offset4 + rank * nInt4PerRank; + if (offset >= nInt4Total) continue; + int4 tmp_raw = buff4[offset]; +#pragma unroll + for (int i = 0; i < NPeers; i++) { + int rankIdx = (rank + i + 1) % NRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); + } + AccumVec acc = mscclpp::upcastVector(tmp_raw); + for (int i = 0; i < NPeers; i++) { + acc = mscclpp::calVectorAccum(acc, data[i]); + } + int4 tmp = mscclpp::downcastVector(acc); +#pragma unroll + for (int i = 0; i < NPeers; i++) { + int rankIdx = (rank + i + 1) % NRanksPerNode; + int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; + mscclpp::write(((void**)remoteMemories)[outputRemoteBufferOffset + peerIdx], offset, tmp); + } + resultBuff4[offset] = tmp; + } + // Using a device barrier gives better performance here. 
+ globalSyncer.sync(gridDim.x); + if (blockIdx.x == 0 && threadIdx.x < NPeers) { + memoryChannelsLocal[threadIdx.x].signal(); + memoryChannelsLocal[threadIdx.x].wait(); + } +} + +template +struct AllreduceRsAgZeroCopyAdapter { + static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, + DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, + size_t, int rank, int nRanksPerNode, int worldSize, size_t inputSize, cudaStream_t stream, + void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { + using ChannelType = DeviceHandle; + size_t nelems = inputSize / sizeof(T); + if (nBlocks == 0 || nThreadsPerBlock == 0) { + nThreadsPerBlock = 1024; + nBlocks = 64; + if (inputSize >= (1 << 26)) { + nBlocks = 128; + } + } + if (nRanksPerNode == 4) { + allreduceRsAgZeroCopy<4, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else if (nRanksPerNode == 8) { + allreduceRsAgZeroCopy<8, OpType, T, AccumT> + <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, + switchChannel, remoteMemories, rank, worldSize, nelems); + } else { + THROW(ALGO, Error, ErrorCode::InvalidUsage, "Unsupported number of ranks per node: ", nRanksPerNode); + } + return cudaGetLastError(); + } +}; + +void AllreduceRsAgZeroCopy::initialize(std::shared_ptr comm) { + this->conns_ = setupConnections(comm); + nChannelsPerConnection_ = 128; + comm_ = comm; + // setup semaphores + this->semaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); + this->baseChannels_ = setupBaseMemoryChannels(this->conns_, this->semaphores_, nChannelsPerConnection_); + this->baseMemoryChannelHandles_ = setupBaseMemoryChannelDeviceHandles(baseChannels_); +} + +CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, + size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map&, + DataType accumDtype) { + auto algoCtx = std::static_pointer_cast(ctx); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); + if (!allreduce) { + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); + return CommResult::CommInvalidArgument; + } + std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; + cudaError_t error = + allreduce(input, nullptr, output, this->baseMemoryChannelHandles_.get(), algoCtx->remoteMemoryHandles.get(), + nullptr, nullptr, 0, 0, 0, algoCtx->rank, algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, + nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + if (error != cudaSuccess) { + WARN(ALGO, "Allreduce kernel launch failed with error: ", cudaGetErrorString(error)); + return CommResult::CommUnhandledCudaError; + } + return CommResult::CommSuccess; +} + +AlgorithmCtxKey AllreduceRsAgZeroCopy::generateAllreduceContextKey(const void* inputBuffer, void* outputBuffer, + size_t size, DataType, bool symmetricMemory) { + // For non-symmetric algorithms, we use both input and output buffer pointers in the key. 
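+ // (Editor's note, not part of the upstream change: ++tag below makes every non-symmetric key unique, so those contexts bypass the context cache and are rebuilt on each call, while symmetric-memory keys hash by base allocation and can be reused.)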
+ static int tag = 0; + if (symmetricMemory) { + size_t inputBytes, outputBytes; + CUdeviceptr inputBasePtr, outputBasePtr; + MSCCLPP_CUTHROW(cuMemGetAddressRange(&inputBasePtr, &inputBytes, (CUdeviceptr)inputBuffer)); + MSCCLPP_CUTHROW(cuMemGetAddressRange(&outputBasePtr, &outputBytes, (CUdeviceptr)outputBuffer)); + return AlgorithmCtxKey{(void*)inputBasePtr, (void*)outputBasePtr, inputBytes, outputBytes, 0}; + } + return AlgorithmCtxKey{(void*)inputBuffer, outputBuffer, size, size, ++tag}; +} + +std::shared_ptr AllreduceRsAgZeroCopy::initAllreduceContext(std::shared_ptr comm, const void* input, + void* output, size_t size, DataType) { + auto ctx = std::make_shared(); + ctx->rank = comm->bootstrap()->getRank(); + ctx->workSize = comm->bootstrap()->getNranks(); + ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); + + ctx->memorySemaphores = this->semaphores_; + + // register input and output memories + RegisteredMemory inputMemory = comm->registerMemory((void*)input, size, Transport::CudaIpc); + RegisteredMemory outputMemory = comm->registerMemory(output, size, Transport::CudaIpc); + this->inputMemories_.push_back(inputMemory); + this->outputMemories_.push_back(outputMemory); + + auto remoteInputMemories = setupRemoteMemories(comm, ctx->rank, inputMemory); + auto remoteOutputMemories = setupRemoteMemories(comm, ctx->rank, outputMemory); + ctx->registeredMemories.insert(ctx->registeredMemories.end(), remoteInputMemories.begin(), remoteInputMemories.end()); + ctx->registeredMemories.insert(ctx->registeredMemories.end(), remoteOutputMemories.begin(), + remoteOutputMemories.end()); + std::vector remoteMemoryHandles; + for (const auto& remoteMemory : ctx->registeredMemories) { + remoteMemoryHandles.push_back(remoteMemory.data()); + } + ctx->remoteMemoryHandles = detail::gpuCallocShared(remoteMemoryHandles.size()); + gpuMemcpy(ctx->remoteMemoryHandles.get(), remoteMemoryHandles.data(), remoteMemoryHandles.size(), + cudaMemcpyHostToDevice); + + // store local registered memories to ctx for lifetime management + ctx->registeredMemories.push_back(inputMemory); + ctx->registeredMemories.push_back(outputMemory); + return ctx; +} + +std::shared_ptr AllreduceRsAgZeroCopy::build() { + auto self = std::make_shared(); + return std::make_shared( + "default_allreduce_rsag_zero_copy", "allreduce", + [self](std::shared_ptr comm) { self->initialize(comm); }, + [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { + return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, + extras, accumDtype); + }, + [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, + [[maybe_unused]] size_t outputSize, + DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); +} +} // namespace collective +} // namespace mscclpp diff --git a/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu b/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu index 0ed8ff3c..20e420b2 100644 --- a/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu +++ 
b/src/ext/collectives/alltoallv/alltoallv_fullmesh.cu @@ -67,7 +67,8 @@ std::shared_ptr AlltoallvFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->alltoallvKernelFunc(ctx, input, output, inputSize, outputSize, dtype, stream, nBlocks, nThreadsPerBlock, extras); }, @@ -77,7 +78,8 @@ std::shared_ptr AlltoallvFullmesh::build() { return self->initAlltoallvContext(comm, input, output, inputSize, outputSize, dtype); }, // Context key generation function - [self](const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, + [[maybe_unused]] bool symmetricMemory) { return self->generateAlltoallvContextKey(input, output, inputSize, outputSize, dtype); }); diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp index 085f4ac4..d1a4bbcd 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp @@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp index ea176ba1..56783e3b 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp @@ -11,11 +11,11 @@ namespace collective { class AllgatherFullmesh2 : public AlgorithmBuilder { public: - AllgatherFullmesh2(); + AllgatherFullmesh2() = default; std::shared_ptr build() override; private: - bool disableChannelCache_; + bool symmetricMemory_; std::vector conns_; std::vector> memorySemaphores_; const int nChannelsPerConnection_ = 35; @@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool); }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index e995b940..362308b2 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -9,19 +9,22 @@ namespace mscclpp { namespace collective { class AllreduceAllpairPacket : public AlgorithmBuilder { public: - AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceAllpairPacket(uintptr_t scratchBuffer, 
size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; @@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; - std::shared_ptr flags7_; - std::shared_ptr flags28_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index 31a7f145..a54352b3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -16,11 +16,11 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; std::shared_ptr comm_; @@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { RegisteredMemory localScratchMemory_; std::unordered_map, std::shared_ptr>>> memoryChannelsMap_; + bool symmetricMemory_ = false; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp similarity index 72% rename from src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 1077b122..81b74add 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -1,14 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ + #include namespace mscclpp { namespace collective { -class AllreduceNvlsWithCopy : public AlgorithmBuilder { +class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { public: - AllreduceNvlsWithCopy(uintptr_t scratchBuffer, size_t scratchBufferSize) + AllreduceNvlsBlockPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) : scratchBuffer_(reinterpret_cast(scratchBuffer)), scratchBufferSize_(scratchBufferSize){}; std::shared_ptr build() override; @@ -16,11 +19,11 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; @@ -29,6 +32,9 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_BLOCK_PIPELINE_HPP_ diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 8761162a..fb0c63b8 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -10,27 +10,32 @@ namespace mscclpp { namespace collective { class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { public: - AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras); + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const size_t nvlsBufferSize_ = (1 << 30); const int maxBlockNum_ = 16; - std::shared_ptr flags_; - std::shared_ptr flags4_; - std::shared_ptr flags8_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; + 
std::vector> nvlsConnections_; + std::vector switchChannels_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp similarity index 73% rename from src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index 7bfa9822..8f02a873 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ -#define MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ +#ifndef MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ #include namespace mscclpp { namespace collective { -class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { +class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { public: - AllreduceNvlsWithCopy2(uintptr_t scratchBuffer, size_t scratchBufferSize) + AllreduceNvlsWarpPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) : scratchBuffer_(reinterpret_cast(scratchBuffer)), scratchBufferSize_(scratchBufferSize){}; std::shared_ptr build() override; @@ -19,11 +19,11 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; @@ -32,8 +32,9 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; }; } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_WITH_COPY_2_HPP_ \ No newline at end of file +#endif // MSCCLPP_EXT_ALLREDUCE_NVLS_WARP_PIPELINE_HPP_ diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp similarity index 60% rename from src/ext/collectives/include/allreduce/allreduce_nvls.hpp rename to src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index 4591cb42..d53ea180 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
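The AllreduceAllpairPacket and AllreduceNvlsPacket constructors above replace internally allocated flag buffers (shared_ptr members) with a caller-owned flag buffer passed as a uintptr_t plus a size. A self-contained sketch of that ownership pattern, with all names invented:

#include <cstddef>
#include <cstdint>
#include <vector>

class PacketBuilder {  // stand-in for the builders above, not mscclpp code
 public:
  PacketBuilder(uintptr_t scratchBuffer, size_t scratchBufferSize,
                uintptr_t flagBuffer, size_t flagBufferSize)
      : scratchBuffer_(reinterpret_cast<void*>(scratchBuffer)),
        scratchBufferSize_(scratchBufferSize),
        flagBuffer_(flagBuffer),
        flagBufferSize_(flagBufferSize) {}

 private:
  void* scratchBuffer_;
  size_t scratchBufferSize_;
  uintptr_t flagBuffer_;  // lifetime owned by the caller, not the builder
  size_t flagBufferSize_;
};

int main() {
  // One caller-owned allocation can back the flags of several packet
  // algorithms, instead of each builder holding its own shared_ptr buffers.
  std::vector<uint32_t> flags(1024, 0);
  std::vector<char> scratch(1 << 20);
  PacketBuilder builder(reinterpret_cast<uintptr_t>(scratch.data()), scratch.size(),
                        reinterpret_cast<uintptr_t>(flags.data()),
                        flags.size() * sizeof(uint32_t));
  (void)builder;
  return 0;
}

Passing raw addresses keeps the builder headers free of allocator details and makes reuse of one flag allocation across algorithms explicit.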
+#ifndef MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ +#define MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ + #include namespace mscclpp { @@ -12,21 +15,30 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: + bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); - const size_t nvlsBufferSize_ = (1 << 30); + // Large buffer size because cuMemMap requires offset=0 for multicast handles, so the entire + // user allocation must be mapped. This only reserves virtual address space; no physical memory + // is consumed beyond what is actually bound. + const size_t nvlsBufferSize_ = (1UL << 34); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + std::vector> nvlsConnections_; + std::vector> nvlsOutConnections_; + int computeCapabilityMajor_{0}; }; } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp + +#endif // MSCCLPP_ALLREDUCE_NVLS_ZERO_COPY_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f562aca5..de7ca471 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -9,28 +9,32 @@ namespace mscclpp { namespace collective { class AllreducePacket : public AlgorithmBuilder { public: - AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; const int maxBlockNum_ = 56; std::vector conns_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp 
b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp new file mode 100644 index 00000000..1fd663da --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAg : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAg(uintptr_t scratchBuffer, size_t scratchBufferSize) + : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + void* scratchBuffer_; + size_t scratchBufferSize_; + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> scratchSemaphores_; + std::vector remoteScratchMemories_; + RegisteredMemory localScratchMemory_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; + std::shared_ptr remoteMemoryHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp new file mode 100644 index 00000000..7629f2fe --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
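The (1UL << 34) nvlsBufferSize_ comment in the zero-copy header above relies on CUDA's virtual memory management separating address-space reservation from physical backing: reserving a large range is nearly free, and only the portion that is actually created and mapped consumes memory. A hedged driver-API sketch of that property (error handling elided; illustrative only, not code from this patch):

#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice dev; cuDeviceGet(&dev, 0);
  CUcontext ctx; cuCtxCreate(&ctx, 0, dev);

  const size_t vaSize = 1ULL << 34;  // 16 GiB of VA, no physical cost yet
  CUdeviceptr base;
  cuMemAddressReserve(&base, vaSize, 0, 0, 0);

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;
  size_t gran;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

  CUmemGenericAllocationHandle h;
  cuMemCreate(&h, gran, &prop, 0);  // physical memory: one granule only
  cuMemMap(base, gran, 0, h, 0);    // bind it at offset 0 of the reserved range

  CUmemAccessDesc acc = {};
  acc.location = prop.location;
  acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(base, gran, &acc, 1);

  printf("reserved %zu bytes of VA, backed %zu bytes\n", vaSize, gran);

  cuMemUnmap(base, gran);
  cuMemRelease(h);
  cuMemAddressFree(base, vaSize);
  cuCtxDestroy(ctx);
  return 0;
}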
+ +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAgPipeline : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAgPipeline(uintptr_t scratchBuffer, size_t scratchBufferSize) + : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + void* scratchBuffer_; + size_t scratchBufferSize_; + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> scratchSemaphores_; + std::vector remoteScratchMemories_; + RegisteredMemory localScratchMemory_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; + std::shared_ptr remoteMemoryHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_PIPELINE_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp new file mode 100644 index 00000000..05bf2ef3 --- /dev/null +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ +#define MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ + +#include + +namespace mscclpp { +namespace collective { + +class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { + public: + AllreduceRsAgZeroCopy() = default; + std::shared_ptr build() override; + + private: + void initialize(std::shared_ptr comm); + CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, + DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + const std::unordered_map& extras, DataType accumDtype); + + std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, + DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); + std::shared_ptr comm_; + int nChannelsPerConnection_; + std::vector conns_; + std::vector> semaphores_; + std::vector inputMemories_; + std::vector outputMemories_; + + std::vector baseChannels_; + std::shared_ptr> baseMemoryChannelHandles_; +}; +} // namespace collective +} // namespace mscclpp + +#endif // MSCCLPP_EXT_ALLREDUCE_RSAG_ZERO_COPY_HPP_ \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 10eecf7e..1e0e7e69 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
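The new AllreduceRsAg* builders above implement allreduce as reduce-scatter followed by all-gather: each rank first owns the fully reduced 1/N shard of the buffer, then the reduced shards are exchanged. A plain host-side sketch of that schedule (toy code, no mscclpp types; nElems must be divisible by nRanks):

#include <cstdio>
#include <vector>

int main() {
  const int nRanks = 4, nElems = 8;
  std::vector<std::vector<int>> buf(nRanks, std::vector<int>(nElems));
  for (int r = 0; r < nRanks; ++r)
    for (int i = 0; i < nElems; ++i) buf[r][i] = r + i;

  const int shard = nElems / nRanks;
  // Phase 1: reduce-scatter. Rank r accumulates shard r from every peer;
  // each rank writes only its own shard, so the phases need no copies.
  for (int r = 0; r < nRanks; ++r)
    for (int peer = 0; peer < nRanks; ++peer)
      if (peer != r)
        for (int i = r * shard; i < (r + 1) * shard; ++i)
          buf[r][i] += buf[peer][i];

  // Phase 2: all-gather. Every rank pulls each reduced shard from its owner.
  for (int r = 0; r < nRanks; ++r)
    for (int owner = 0; owner < nRanks; ++owner)
      if (owner != r)
        for (int i = owner * shard; i < (owner + 1) * shard; ++i)
          buf[r][i] = buf[owner][i];

  // Every element i should now equal the sum over ranks of (r + i) = 6 + 4*i.
  for (int i = 0; i < nElems; ++i)
    printf("elem %d: %d (expect %d)\n", i, buf[0][i], 6 + 4 * i);
  return 0;
}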
-#ifndef MSCCLPP_ALLREDUCE_COMMOM_HPP_ -#define MSCCLPP_ALLREDUCE_COMMOM_HPP_ +#ifndef MSCCLPP_ALLREDUCE_COMMON_HPP_ +#define MSCCLPP_ALLREDUCE_COMMON_HPP_ #include #include @@ -10,6 +10,8 @@ #include #include +#include "reduce_kernel.hpp" + #if defined(ENABLE_NPKIT) #include #endif @@ -22,438 +24,6 @@ constexpr ReduceOp MIN = ReduceOp::MIN; #if defined(MSCCLPP_DEVICE_COMPILE) -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T clip(T val) { - return val; -} - -template <> -__forceinline__ __device__ __half clip(__half val) { - val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); - val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); - - return val; -} - -template <> -__forceinline__ __device__ __half2 clip(__half2 val) { - val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); - val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); - val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); - val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) { - val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); - val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) { - val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); - val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - if constexpr (UseClip) { - return clip(a + b); - } else { - return a + b; - } -} - -template -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ T min_elements(T a, T b) { - return (a < b ? 
a : b); -} - -template <> -__forceinline__ __device__ __half2 min_elements(__half2 a, __half2 b) { -#if defined(__HIP_PLATFORM_AMD__) - __half2 val; - val.x = __hmin(a.x, b.x); - val.y = __hmin(a.y, b.y); - return val; -#else - return __hmin2(a, b); -#endif -} - -template <> -__forceinline__ __device__ __bfloat162 min_elements(__bfloat162 a, __bfloat162 b) { - return __hmin2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 clipping function -template <> -__forceinline__ __device__ __fp8_e4m3 clip(__fp8_e4m3 val) { - // FP8 E4M3 has range [-448, 448], no infinities - // Built-in saturation in FP8 arithmetic - return val; -} - -// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value -template <> -__forceinline__ __device__ __fp8_e5m2 clip(__fp8_e5m2 val) { - // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow - // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) - float fval = float(val); - fval = fmaxf(fval, -57344.0f); - fval = fminf(fval, 57344.0f); - return __fp8_e5m2(fval); -} - -// FP8 E4M3 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition (CUDA 11.8+) - __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -// FP8 E4M3 vectorized addition for 2 elements -template -__forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); - return result; -#else - // Fallback for non-gfx942 HIP: element-wise using single-element operations - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = add_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = add_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#endif -} - -// FP8 E4M3 vectorized addition for 4 elements (via 2x __fp8x2_e4m3) -template -__forceinline__ __device__ __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using add_elements for 2 elements - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition - __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template -__forceinline__ __device__ __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e5m2 result = __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); - return result; -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template -__forceinline__ __device__ __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using add_elements for 2 elements - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif // !defined(__HIP_PLATFORM_AMD__) - -// FP8 E4M3 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e4m3 min_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e4m3(fminf(float(a), float(b))); -#else - return __fp8_e4m3(__hmin(__half(a), __half(b))); -#endif -} - -// FP8 E4M3 vectorized min for 2 elements -__forceinline__ __device__ __fp8x2_e4m3 min_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - // HIP implementation: use union and process element-wise - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = min_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = min_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#else - return __fp8x2_e4m3(__hmin2(__half2(a), __half2(b))); -#endif -} - -// FP8 E4M3 vectorized min for 4 elements -__forceinline__ __device__ __fp8x4_e4m3 min_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using min_elements for 2 elements - union { - __fp8x4_e4m3 vec4; - __fp8x2_e4m3 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} - -// FP8 E5M2 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e5m2 min_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e5m2(fminf(float(a), float(b))); -#else - return __fp8_e5m2(__hmin(__half(a), __half(b))); -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized min for 2 elements (CUDA only) -__forceinline__ __device__ __fp8x2_e5m2 min_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hmin2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized min for 4 elements (CUDA only) -__forceinline__ __device__ __fp8x4_e5m2 min_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using min_elements for 2 elements - union { - __fp8x4_e5m2 vec4; - __fp8x2_e5m2 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} -#endif // !defined(__HIP_PLATFORM_AMD__) -#endif // __FP8_TYPES_EXIST__ - -template -__forceinline__ __device__ T cal_elements(T a, T b) { - if constexpr (OpType == SUM) { - return add_elements(a, b); - } else if constexpr (OpType == MIN) { - return min_elements(a, b); - } - // 
Should never reach here - return a; -} - -template -__forceinline__ __device__ int4 cal_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ uint2 cal_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ int cal_vectors_helper(int a, int b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); -} - -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) -// Helper function to perform FP8 vector addition - dispatches based on scalar type -// Uses AMD builtins from hip/amd_detail/amd_hip_fp8.h: -// - __builtin_amdgcn_cvt_pk_f32_fp8/bf8: Convert 2 FP8 values to 2 floats -// - __builtin_amdgcn_cvt_pk_fp8/bf8_f32: Convert 2 floats to 2 FP8 values -// The 'word' parameter (false/true) selects low/high 16-bit word from uint32_t -template -__forceinline__ __device__ int add_fp8x4_hip(int a, int b) { - uint32_t a32 = static_cast(a); - uint32_t b32 = static_cast(b); - - float2 v_low, v_high; - uint32_t ival = 0; - - if constexpr (std::is_same_v) { - // E4M3 using fp8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } else { // __fp8_e5m2 - // E5M2 using bf8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } -} -#endif - -template -__forceinline__ __device__ DataType cal_vectors(DataType a, DataType b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) - // For FP8 types on HIP gfx942, use specialized helper that dispatches based on scalar type - if constexpr (std::is_same_v || std::is_same_v) { - if constexpr (OpType == SUM) { - if constexpr (std::is_same_v || std::is_same_v) { - // Handle int/uint32_t (4 FP8 elements) - return add_fp8x4_hip(a, b); - } else if constexpr (std::is_same_v) { - // Handle int4 (16 FP8 elements) - process as 4 ints - int4 ret; - ret.w = add_fp8x4_hip(a.w, b.w); - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = 
add_fp8x4_hip(a.y, b.y); - ret.z = add_fp8x4_hip(a.z, b.z); - return ret; - } else if constexpr (std::is_same_v) { - // Handle uint2 (8 FP8 elements) - process as 2 ints - uint2 ret; - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = add_fp8x4_hip(a.y, b.y); - return ret; - } - } - } -#endif - - // Define the vectorized computation type based on the element type - using CompType = typename std::conditional_t< - std::is_same_v, __half2, - std::conditional_t, __bfloat162, -#if defined(__FP8_TYPES_EXIST__) - std::conditional_t, __fp8x4_e4m3, - std::conditional_t, __fp8x4_e5m2, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>; -#else - >>; -#endif - return cal_vectors_helper(a, b); -} - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { @@ -472,7 +42,12 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src // nvls can only handle 4 bytes alignment MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); constexpr size_t nElem = calcVectorSize(); - using vectorType = mscclpp::VectorType; + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v) + ? 1 + : nElem; + using vectorType = mscclpp::VectorType; const size_t nVec = size / sizeof(vectorType); const size_t srcOffset4 = srcOffset / sizeof(vectorType); const size_t dstOffset4 = dstOffset / sizeof(vectorType); @@ -500,53 +75,53 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src using AllreduceFunc = std::function*, mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, - size_t, cudaStream_t, void*, uint32_t, int, int)>; + size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; -template
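The vecSize logic added to handleMultiLoadReduceStore above selects the vector width at compile time, falling back to scalar (width 1) accesses for integer element types because the multimem reduction path does not support vectorized integer operations. A stand-alone sketch of that dispatch; the concrete integer type list and the 16-byte vector width are assumptions for illustration, not mscclpp's exact values:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T>
constexpr size_t vecWidth() {
  // Multi-element vectors (16-byte accesses) for floating-point types;
  // integers are assumed scalar-only on the multimem path, hence width 1.
  constexpr bool isInteger = std::is_same_v<T, int32_t> ||
                             std::is_same_v<T, uint32_t> ||
                             std::is_same_v<T, int64_t> ||
                             std::is_same_v<T, uint64_t>;
  return isInteger ? 1 : 16 / sizeof(T);
}

int main() {
  printf("float: %zu, double: %zu, int32: %zu\n",
         vecWidth<float>(), vecWidth<double>(), vecWidth<int32_t>());
  return 0;
}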