From 5d18835417da4d3d95841179e19f69aeebf796f4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 19 Mar 2026 11:52:09 -0700 Subject: [PATCH 01/21] Fix use-after-free for fabric allocation handle in GpuIpcMemHandle (#764) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fix a use-after-free where the CUDA allocation handle (`CUmemGenericAllocationHandle`) was released prematurely while the exported fabric handle still referenced it. ## Problem Unlike POSIX FD handles (where the kernel keeps the allocation alive via the open file descriptor), fabric handles do not hold their own reference to the underlying allocation. The original code called `cuMemRelease(allocHandle)` immediately after exporting the fabric handle, freeing the allocation. When a remote process later tries to `cuMemImportFromShareableHandle` using that fabric handle, it references a freed allocation — a **use-after-free**. This affected both code paths: 1. **`GpuIpcMemHandle::create()`**: The local `allocHandle` obtained via `cuMemRetainAllocationHandle` was released right after fabric export, leaving the fabric handle dangling. 2. **`GpuIpcMemHandle::createMulticast()`**: The `allocHandle` from `cuMulticastCreate` was unconditionally released, even when it was the only thing keeping the multicast object alive for the fabric handle. ## Fix - **Added `allocHandle` field** to the `fabric` struct in `GpuIpcMemHandle` to store the allocation handle and keep it alive for the lifetime of the `GpuIpcMemHandle`. - **`create()`**: Retain an additional reference via `cuMemRetainAllocationHandle` and store it in `fabric.allocHandle` when a fabric handle is successfully exported. - **`createMulticast()`**: Store the `allocHandle` directly in `fabric.allocHandle` instead of unconditionally releasing it. Only release if fabric export was not used. 
- **`deleter()`**: Release `fabric.allocHandle` via `cuMemRelease` when the handle type includes `Fabric`, ensuring proper cleanup. - **`GpuIpcMem` constructor (importer side)**: Clear `fabric.allocHandle` after importing, since the importer gets its own handle via `cuMemImportFromShareableHandle` and should not release the exporter's allocation handle. ## Files Changed - `src/core/include/gpu_ipc_mem.hpp` — Added `CUmemGenericAllocationHandle allocHandle` to fabric struct. - `src/core/gpu_ipc_mem.cc` — Retain/release allocation handle properly across create, createMulticast, deleter, and importer paths. --- src/core/gpu_ipc_mem.cc | 18 +++++++++++++++--- src/core/include/gpu_ipc_mem.hpp | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc index bc9d375d..c863ecdd 100644 --- a/src/core/gpu_ipc_mem.cc +++ b/src/core/gpu_ipc_mem.cc @@ -140,6 +140,11 @@ void GpuIpcMemHandle::deleter(GpuIpcMemHandle* handle) { UnixSocketServer::instance().unregisterFd(handle->posixFd.fd); ::close(handle->posixFd.fd); } + if (handle->typeFlags & GpuIpcMemHandle::Type::Fabric) { + if (handle->fabric.allocHandle != 0) { + cuMemRelease(handle->fabric.allocHandle); + } + } delete handle; } } @@ -148,6 +153,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { auto handle = UniqueGpuIpcMemHandle(new GpuIpcMemHandle(), &GpuIpcMemHandle::deleter); handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; CUdeviceptr basePtr; size_t sz; @@ -189,6 +195,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::create(const CUdeviceptr ptr) { // FABRIC handle if (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS) { + MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&(handle->fabric.allocHandle), (void*)basePtr)); handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; } @@ -232,6 +239,7 @@ UniqueGpuIpcMemHandle 
GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b handle->offsetFromBase = 0; handle->typeFlags = GpuIpcMemHandle::Type::None; handle->posixFd.fd = -1; + handle->fabric.allocHandle = {}; // POSIX FD handle int fileDesc; @@ -246,6 +254,7 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b if (isFabricAvailable && (cuMemExportToShareableHandle(&(handle->fabric.handle), allocHandle, CU_MEM_HANDLE_TYPE_FABRIC, 0) == CUDA_SUCCESS)) { handle->typeFlags |= GpuIpcMemHandle::Type::Fabric; + handle->fabric.allocHandle = allocHandle; } if (handle->typeFlags == GpuIpcMemHandle::Type::None) { @@ -253,9 +262,10 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created"); } - // Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the - // multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle. - MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + // Only release allocHandle if it is not stored in fabric.allocHandle. + if (!(handle->typeFlags & GpuIpcMemHandle::Type::Fabric)) { + MSCCLPP_CUTHROW(cuMemRelease(allocHandle)); + } return handle; #else // !(CUDA_NVLS_API_AVAILABLE) THROW(GPU, Error, ErrorCode::InvalidUsage, @@ -275,6 +285,8 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) { if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) == CUDA_SUCCESS) { + // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes. 
+ handle_.fabric.allocHandle = {}; type_ = GpuIpcMemHandle::Type::Fabric; } } diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 923e807d..f66545c2 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -44,6 +44,7 @@ struct GpuIpcMemHandle { struct { char handle[64]; + CUmemGenericAllocationHandle allocHandle; } fabric; static void deleter(GpuIpcMemHandle* handle); From 93f6eeaa6b3db46cdf11d659835e81cedc9c94ff Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Mar 2026 23:34:38 -0400 Subject: [PATCH 02/21] Remove GTest dependency, add code coverage, and refactor unit tests and CI pipelines (#744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removes the GTest dependency, replacing it with a minimal custom framework (`test/framework.*`) that covers only what the tests actually use — a unified `TEST()` macro with SFINAE-based fixture auto-detection, `EXPECT_*`/`ASSERT_*` assertions, environments, and setup/teardown. 
- `--exclude-perf-tests` flag and substring-based negative filtering - `MSCCLPP_ENABLE_COVERAGE` CMake option with gcov/lcov; CI uploads to Codecov - Merges standalone `test/perf/` into main test targets - Refactors Azure pipelines to reduce redundancies & make more readable --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Changho Hwang --- .azure-pipelines/codecov.yml | 93 ++++ .azure-pipelines/integration-test.yml | 6 +- .azure-pipelines/multi-nodes-test.yml | 164 ++----- .../{nccl-api-test.yaml => nccl-api-test.yml} | 6 +- .azure-pipelines/rccl-api-test.yml | 3 +- .azure-pipelines/templates/codecov.yml | 110 +++++ .azure-pipelines/templates/deploy.yml | 131 ++++++ .../templates/integration-test.yaml | 242 ----------- .../templates/integration-test.yml | 76 ++++ .azure-pipelines/templates/nccl-test.yaml | 282 ------------ .azure-pipelines/templates/nccl-test.yml | 76 ++++ .azure-pipelines/templates/rccl-test.yaml | 142 ------ .azure-pipelines/templates/rccl-test.yml | 63 +++ .../templates/run-remote-task.yml | 27 ++ .azure-pipelines/templates/stop.yml | 20 + .azure-pipelines/templates/ut-no-ib-env.yaml | 191 --------- .azure-pipelines/templates/ut-no-ib-env.yml | 95 ++++ .azure-pipelines/templates/ut-npkit.yaml | 145 ------- .azure-pipelines/templates/ut-npkit.yml | 57 +++ .azure-pipelines/templates/ut.yaml | 142 ------ .azure-pipelines/templates/ut.yml | 48 +++ .azure-pipelines/ut-rocm.yml | 50 --- .azure-pipelines/ut.yml | 45 +- .codecov.yml | 24 ++ .github/workflows/codeql-analysis.yml | 6 +- .../{doc-build.yaml => doc-build.yml} | 0 .github/workflows/integration-test-backup.yml | 69 --- .github/workflows/mscclpp-lang.yml | 2 +- .github/workflows/ut-backup.yml | 52 --- .gitignore | 1 + CMakeLists.txt | 59 ++- README.md | 13 +- docker/base-dev-x.dockerfile | 30 +- docker/build.sh | 18 +- docs/quickstart.md | 5 +- test/CMakeLists.txt | 27 +- test/deploy/deploy.sh | 2 +- test/deploy/run-remote.sh | 107 
+++++ test/deploy/run_tests.sh | 1 - test/executor_test.cc | 10 +- test/framework.cc | 323 ++++++++++++++ test/framework.hpp | 405 ++++++++++++++++++ test/mp_unit/bootstrap_tests.cc | 18 +- test/mp_unit/communicator_tests.cu | 8 +- test/mp_unit/executor_tests.cc | 7 +- test/mp_unit/ib_tests.cu | 12 +- test/mp_unit/memory_channel_tests.cu | 95 ++-- test/mp_unit/mp_unit_tests.cc | 17 +- test/mp_unit/mp_unit_tests.hpp | 14 +- test/mp_unit/port_channel_tests.cu | 145 ++----- test/mp_unit/switch_channel_tests.cu | 45 +- test/perf/CMakeLists.txt | 44 -- test/perf/fifo_test.cu | 298 ------------- test/perf/framework.cc | 208 --------- test/perf/framework.hpp | 80 ---- test/unit/CMakeLists.txt | 4 +- test/unit/compile_tests.cu | 4 +- test/unit/core_tests.cc | 20 +- test/unit/errors_tests.cc | 17 +- test/unit/fifo_perf_tests.cu | 85 ++++ test/unit/fifo_tests.cu | 5 +- test/unit/gpu_utils_tests.cc | 6 +- test/unit/local_channel_tests.cu | 6 +- test/unit/numa_tests.cc | 6 +- test/unit/socket_tests.cc | 5 +- test/unit/unit_tests_main.cc | 6 + test/unit/utils_internal_tests.cc | 3 +- test/unit/utils_tests.cc | 6 +- 68 files changed, 2116 insertions(+), 2416 deletions(-) create mode 100644 .azure-pipelines/codecov.yml rename .azure-pipelines/{nccl-api-test.yaml => nccl-api-test.yml} (88%) create mode 100644 .azure-pipelines/templates/codecov.yml create mode 100644 .azure-pipelines/templates/deploy.yml delete mode 100644 .azure-pipelines/templates/integration-test.yaml create mode 100644 .azure-pipelines/templates/integration-test.yml delete mode 100644 .azure-pipelines/templates/nccl-test.yaml create mode 100644 .azure-pipelines/templates/nccl-test.yml delete mode 100644 .azure-pipelines/templates/rccl-test.yaml create mode 100644 .azure-pipelines/templates/rccl-test.yml create mode 100644 .azure-pipelines/templates/run-remote-task.yml create mode 100644 .azure-pipelines/templates/stop.yml delete mode 100644 .azure-pipelines/templates/ut-no-ib-env.yaml create mode 100644 
.azure-pipelines/templates/ut-no-ib-env.yml delete mode 100644 .azure-pipelines/templates/ut-npkit.yaml create mode 100644 .azure-pipelines/templates/ut-npkit.yml delete mode 100644 .azure-pipelines/templates/ut.yaml create mode 100644 .azure-pipelines/templates/ut.yml delete mode 100644 .azure-pipelines/ut-rocm.yml create mode 100644 .codecov.yml rename .github/workflows/{doc-build.yaml => doc-build.yml} (100%) delete mode 100644 .github/workflows/integration-test-backup.yml delete mode 100644 .github/workflows/ut-backup.yml create mode 100755 test/deploy/run-remote.sh create mode 100644 test/framework.cc create mode 100644 test/framework.hpp delete mode 100644 test/perf/CMakeLists.txt delete mode 100644 test/perf/fifo_test.cu delete mode 100644 test/perf/framework.cc delete mode 100644 test/perf/framework.hpp create mode 100644 test/unit/fifo_perf_tests.cu create mode 100644 test/unit/unit_tests_main.cc diff --git a/.azure-pipelines/codecov.yml b/.azure-pipelines/codecov.yml new file mode 100644 index 00000000..c4abeaa7 --- /dev/null +++ b/.azure-pipelines/codecov.yml @@ -0,0 +1,93 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: CodeCoverageA100 + timeoutInMinutes: 40 + pool: + name: msccl-ci + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci + vmssName: mscclpp-ci + gpuArch: '80' + +- job: CodeCoverageH100 + timeoutInMinutes: 40 + pool: + name: msccl-ci-h100 + variables: + - group: mscclpp + strategy: + matrix: + cuda12: + containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-h100 + vmssName: mscclpp-h100-ci + gpuArch: '90' + +- job: CodeCoverageMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + variables: + - group: mscclpp + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/codecov.yml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index f6fe3a47..d5d5f9bd 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -41,11 +41,10 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: IntegrationTestH100 @@ -61,10 +60,9 @@ jobs: image: $(containerImage) steps: - - template: templates/integration-test.yaml + - template: templates/integration-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem perfBaselineFile: test/deploy/perf_ndmv5.jsonl gpuArch: '90' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 914c2317..d4924879 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -37,33 +37,6 @@ jobs: image: $[ variables['containerImage'] ] steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: mscclpp-ssh.key - - - task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - task: Bash@3 displayName: Add HostEntry inputs: @@ -77,107 +50,46 @@ jobs: echo "Entry already exists, nothing to do." fi - - task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name mscclit-vmss --resource-group msccl-IT + - template: templates/deploy.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT - - task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - workingDirectory: '$(System.DefaultWorkingDirectory)' + - template: templates/run-remote-task.yml + parameters: + name: RunMscclppTest + displayName: Run multi-nodes mscclpp-test + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test - - task: Bash@3 - name: RunMscclppTest - displayName: Run multi-nodes mscclpp-test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodeUnitTest + displayName: Run multi-nodes unit tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh mp-ut - - task: Bash@3 - name: RunMultiNodeUnitTest - displayName: Run multi-nodes unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonTests + displayName: Run multi-nodes python tests + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh pytests - - task: Bash@3 - name: RunMultiNodePythonTests - displayName: Run multi-nodes python tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! 
- parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' - kill $CHILD_PID + - template: templates/run-remote-task.yml + parameters: + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser' + remoteScript: | + bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark - - task: Bash@3 - name: RunMultiNodePythonBenchmark - displayName: Run multi-nodes python benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - rm -rf output/* - mkdir -p output - touch output/mscclit-000000 - tail -f output/mscclit-000000 & - CHILD_PID=$! - parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' - kill $CHILD_PID - - - task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: msccl-it - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name mscclit-vmss --resource-group msccl-IT + - template: templates/stop.yml + parameters: + subscription: msccl-it + vmssName: mscclit-vmss + resourceGroup: msccl-IT diff --git a/.azure-pipelines/nccl-api-test.yaml b/.azure-pipelines/nccl-api-test.yml similarity index 88% rename from .azure-pipelines/nccl-api-test.yaml rename to .azure-pipelines/nccl-api-test.yml index 4951c5bd..cc017412 100644 --- a/.azure-pipelines/nccl-api-test.yaml +++ b/.azure-pipelines/nccl-api-test.yml @@ -40,11 +40,10 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml 
+ - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_80,code=sm_80" - job: NcclTestH100 @@ -61,9 +60,8 @@ jobs: image: $(containerImage) steps: - - template: templates/nccl-test.yaml + - template: templates/nccl-test.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem nvccGencode: "-gencode=arch=compute_90,code=sm_90" \ No newline at end of file diff --git a/.azure-pipelines/rccl-api-test.yml b/.azure-pipelines/rccl-api-test.yml index 92c5874f..43841079 100644 --- a/.azure-pipelines/rccl-api-test.yml +++ b/.azure-pipelines/rccl-api-test.yml @@ -40,9 +40,8 @@ jobs: image: $(containerImage) steps: - - template: templates/rccl-test.yaml + - template: templates/rccl-test.yml parameters: subscription: mscclpp-ci-mi300x vmssName: mscclpp-mi300x-ci - sshKeySecureFile: mscclpp.pem gpuArch: gfx942 diff --git a/.azure-pipelines/templates/codecov.yml b/.azure-pipelines/templates/codecov.yml new file mode 100644 index 00000000..08797351 --- /dev/null +++ b/.azure-pipelines/templates/codecov.yml @@ -0,0 +1,110 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + buildType: Debug + cmakeArgs: '-DMSCCLPP_ENABLE_COVERAGE=ON' + buildDisplayName: 'Build with coverage' + buildName: BuildCoverage + deployArgs: 'single-node-test true ${{ parameters.platform }}' + +- template: run-remote-task.yml + parameters: + name: TestsCoverageNonPerf + displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + STRIP_COUNT=$(echo $BUILD_PREFIX | tr 
-cd / | wc -c) + export GCOV_PREFIX=/root/mscclpp + export GCOV_PREFIX_STRIP=$STRIP_COUNT + + echo "Running unit_tests..." + ./build/bin/unit_tests + echo "unit_tests: PASSED" + + echo "Running mp_unit_tests -np 2..." + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 2: PASSED" + + echo "Running mp_unit_tests -np 4..." + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests + echo "mp_unit_tests -np 4: PASSED" + +- template: run-remote-task.yml + parameters: + name: CaptureCoverage + displayName: Capture coverage data with lcov + remoteScript: | + BUILD_PREFIX=$(cat build/BUILD_PREFIX) + + GCOV_TOOL_ARG="" + if [ "${{ parameters.platform }}" = "rocm" ]; then + apt-get update -qq && apt-get install -y -qq llvm 2>/dev/null | tail -1 + GCOV_WRAPPER=$(mktemp) + printf '#!/bin/sh\nexec llvm-cov gcov "$@"\n' > "$GCOV_WRAPPER" + chmod +x "$GCOV_WRAPPER" + GCOV_TOOL_ARG="--gcov-tool ${GCOV_WRAPPER}" + fi + + lcov --version + LCOV_CAPTURE_ARGS="" + if lcov --help 2>&1 | grep -q "inconsistent"; then + LCOV_CAPTURE_ARGS="--ignore-errors inconsistent" + fi + + lcov ${GCOV_TOOL_ARG} --directory . --capture --output-file coverage.info ${LCOV_CAPTURE_ARGS} + if [ ! -s coverage.info ]; then + echo "ERROR: coverage.info was not generated." 
+ exit 1 + fi + + lcov ${GCOV_TOOL_ARG} --extract coverage.info "${BUILD_PREFIX}/src/*" "${BUILD_PREFIX}/include/mscclpp/*" --output-file coverage.info + lcov --list coverage.info + ls -la coverage.info + +- task: Bash@3 + name: FetchCoverage + displayName: Fetch coverage data from remote VM + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + HOST=$(head -1 ${HOSTFILE}) + ssh -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST} \ + 'sudo docker cp mscclpp-test:/root/mscclpp/coverage.info /tmp/coverage.info' + scp -i ${KeyFilePath} -o ${SSH_OPTION} ${HOST}:/tmp/coverage.info $(System.DefaultWorkingDirectory)/coverage.info + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: UploadCodecov + displayName: Upload coverage to Codecov + inputs: + targetType: 'inline' + script: | + set -e + curl -Os https://cli.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }} + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml new file mode 100644 index 00000000..fc116acf --- /dev/null +++ b/.azure-pipelines/templates/deploy.yml @@ -0,0 +1,131 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp +# Build parameters +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + default: '' +- name: buildType + type: string + default: 'Release' +- name: buildTests + type: string + default: 'true' +- name: cmakeArgs + type: string + default: '' +- name: 
buildName + type: string + default: 'Build' +- name: buildDisplayName + type: string + default: 'Build' +# Deploy parameters +- name: deployArgs + type: string + default: '' + +steps: +# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks. +- task: Bash@3 + name: EnsureAzureCLI + displayName: Ensure Azure CLI Installed + inputs: + targetType: inline + script: | + set -e + if command -v az >/dev/null 2>&1; then + az version >/dev/null + exit 0 + fi + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# 1. Build +- task: Bash@3 + name: ${{ parameters.buildName }} + displayName: ${{ parameters.buildDisplayName }} + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + BUILD_TESTS_ARG="" + if [ "${{ parameters.buildTests }}" = "true" ]; then + BUILD_TESTS_ARG="-DMSCCLPP_BUILD_TESTS=ON" + fi + + GPU_ARCH_ARG="" + if [ -n "${{ parameters.gpuArch }}" ]; then + GPU_ARCH_ARG="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ "${{ parameters.platform }}" = "rocm" ]; then + eval CXX=/opt/rocm/bin/hipcc cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_ROCM=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + else + eval cmake \ + -DCMAKE_BUILD_TYPE=${{ parameters.buildType }} \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + ${BUILD_TESTS_ARG} \ + ${GPU_ARCH_ARG} \ + ${CMAKE_EXTRA_ARGS} .. + fi + make -j + cd .. + pwd > build/BUILD_PREFIX + echo "=== Build artifacts ===" + ls -la build/bin/ || echo "ERROR: build/bin/ missing after build" + du -sh build/bin/* 2>/dev/null || true + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 2. 
Download SSH key + install packages + start VMSS +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: mscclpp.pem + +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: 'inline' + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} + +# 3. Deploy test environment +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: ${{ parameters.deployArgs }} + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/integration-test.yaml b/.azure-pipelines/templates/integration-test.yaml deleted file mode 100644 index 99ed6d04..00000000 --- a/.azure-pipelines/templates/integration-test.yaml +++ /dev/null @@ -1,242 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: perfBaselineFile - type: string - default: 'test/deploy/perf_ndmv4.jsonl' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: inline - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - set -e; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: SendRecvTest - displayName: Run mscclpp SendRecv test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp AllReduce test - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN 
./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: AllToAll - displayName: Run mscclpp AllToAll test - inputs: - targetType: 'inline' - script: | - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: CheckPerfNumber - displayName: Check collective primitives performance - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PythonAllReduceBenchmark - displayName: Python Allreduce Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - python3 -m pip install .; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: FifoPerfBenchmark - displayName: FIFO Performance Benchmark - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . 
-t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - set -e; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - ./build/bin/perf/fifo_test"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/integration-test.yml b/.azure-pipelines/templates/integration-test.yml new file mode 100644 index 00000000..b686e4f2 --- /dev/null +++ b/.azure-pipelines/templates/integration-test.yml @@ -0,0 +1,76 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: perfBaselineFile + type: string + default: 'test/deploy/perf_ndmv4.jsonl' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: AllGatherTest + displayName: Run mscclpp AllGather test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN 
./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: SendRecvTest + displayName: Run mscclpp SendRecv test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllReduceTest + displayName: Run mscclpp AllReduce test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: AllToAll + displayName: Run mscclpp AllToAll test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K 
-e 1G -f 2 -k 1 -o output.jsonl + +- template: run-remote-task.yml + parameters: + name: CheckPerfNumber + displayName: Check collective primitives performance + remoteScript: | + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} + +- template: run-remote-task.yml + parameters: + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + remoteScript: | + python3 -m pip install . + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} \ No newline at end of file diff --git a/.azure-pipelines/templates/nccl-test.yaml b/.azure-pipelines/templates/nccl-test.yaml deleted file mode 100644 index ef4a9fa8..00000000 --- a/.azure-pipelines/templates/nccl-test.yaml +++ /dev/null @@ -1,282 +0,0 @@ -# .azure-pipelines/templates/nccl-test.yaml -# ---------------------------------------- -# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. -# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# sshKeySecureFile – the secureFile name for your SSH key - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: nvccGencode - type: string - default: "-gencode=arch=compute_80,code=sm_80" - -steps: -- checkout: self -- checkout: git://One/msccl-users -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: mscclpp/test/deploy/deploy.sh - arguments: nccltest-single-node - workingDirectory: $(System.DefaultWorkingDirectory)/mscclpp - -- task: Bash@3 - name: CopyMscclUsers - displayName: Copy msccl-users - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - DST_DIR="/tmp/mscclpp/msccl-users" - parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: GenerateExecutionFile -# displayName: Generate execution file -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd 
/root/mscclpp/msccl-users; \ -# mkdir -p execution-files; \ -# cd /root/mscclpp/msccl-users; \ -# bash algos/mscclpp_a100/generate_execution_plan.sh"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNcclTests - displayName: Install NCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; git clone https://github.com/NVIDIA/nccl-tests.git; \ - cd nccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllReduceTest -# displayName: Run NCCL AllReduce Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclAllGatherTest -# displayName: Run NCCL AllGather Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# 
SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterTest -# displayName: Run NCCL Reduce Scatter Test -# inputs: -# targetType: inline -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: InstallNccl - displayName: Install NCCL - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - LATEST_TAG=\$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | 
cut -d\\\" -f4); \ - if [ -z \"\$LATEST_TAG\" ]; then echo \"Failed to fetch latest NCCL tag\"; exit 1; fi; \ - cd; git clone --branch \$LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git; \ - cd nccl; \ - make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllGatherFallbaclkToNcclTest - displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x 
MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclAllReduceFallbaclkToNcclTest - displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" 
/root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allgather\" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunNcclBroadcastFallbaclkToNcclTest - displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa 
--allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"allreduce\" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -# - task: Bash@3 -# name: RunNcclReduceScatterFallbaclkToNcclTest -# displayName: Run NCCL ReduceScatter Test with or without Fallback to NCCL operation -# inputs: -# targetType: 'inline' -# script: | -# set -e -# HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci -# ROOT_DIR=$(System.DefaultWorkingDirectory)/mscclpp -# SSH_OPTION="StrictHostKeyChecking=no" -# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} -# parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -# -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ -# cd /root/mscclpp; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x 
MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"reducescatter\" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="reducescatter" /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ -# echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION=\"broadcast\" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\"; \ -# mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/reduce_scatter_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' -# workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml new file mode 100644 index 00000000..211e2393 --- /dev/null +++ 
b/.azure-pipelines/templates/nccl-test.yml @@ -0,0 +1,76 @@ +# .azure-pipelines/templates/nccl-test.yml +# ---------------------------------------- +# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: nvccGencode + type: string + default: "-gencode=arch=compute_80,code=sm_80" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + deployArgs: 'nccltest-single-node' + +- template: run-remote-task.yml + parameters: + name: InstallNcclTests + displayName: Install NCCL Tests + remoteScript: | + cd + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: InstallNccl + displayName: Install NCCL + remoteScript: | + LATEST_TAG=$(curl -fsSL https://api.github.com/repos/NVIDIA/nccl/releases/latest | grep tag_name | cut -d\" -f4) + if [ -z "$LATEST_TAG" ]; then + echo "Failed to fetch latest NCCL tag" + exit 1 + fi + cd + git clone --branch $LATEST_TAG --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl + make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }} + +- template: run-remote-task.yml + parameters: + name: RunNcclAllGatherFallbaclkToNcclTest + displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa 
--allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclAllReduceFallbaclkToNcclTest + displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunNcclBroadcastFallbaclkToNcclTest + displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root -x 
LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/rccl-test.yaml b/.azure-pipelines/templates/rccl-test.yaml deleted file mode 100644 index 040605df..00000000 --- a/.azure-pipelines/templates/rccl-test.yaml +++ /dev/null @@ -1,142 +0,0 @@ -# .azure-pipelines/templates/rccl-test.yaml -# ------------------------------------------------ -# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. -# -# Parameters: -# subscription – Azure subscription to use for VMSS start/stop -# vmssName – VMSS name to start/stop -# sshKeySecureFile – the secureFile name for your SSH key -# gpuArch – GPU architecture (e.g. gfx942) - -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - default: "gfx942" - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test true rocm" - workingDirectory: $(System.DefaultWorkingDirectory) - - -- task: Bash@3 - name: InstallRcclTests - displayName: Install RCCL Tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd; \ - git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git; \ - cd rocm-systems; \ - git sparse-checkout init --cone; \ - git sparse-checkout set projects/rccl-tests; \ - git checkout; \ - cd projects/rccl-tests; \ - MPI=1 MPI_HOME=/usr/local/mpi make -j"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunRcclAllGatherTest - displayName: Run RCCL AllGather Test with or without MSCCLPP Lib - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - 
ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: RunRcclAllReduceTest - displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - ROOT_DIR=$(System.DefaultWorkingDirectory) - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x 
NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20; \ - echo \"mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20\";\ - mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"' - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml new file mode 100644 index 00000000..8e247161 --- /dev/null +++ b/.azure-pipelines/templates/rccl-test.yml @@ -0,0 +1,63 @@ +# .azure-pipelines/templates/rccl-test.yml +# ------------------------------------------------ +# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container. +# +# Parameters: +# subscription – Azure subscription to use for VMSS start/stop +# vmssName – VMSS name to start/stop +# gpuArch – GPU architecture (e.g. 
gfx942) + +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + default: "gfx942" + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: rocm + gpuArch: ${{ parameters.gpuArch }} + buildTests: false + deployArgs: 'single-node-test true rocm' + + +- template: run-remote-task.yml + parameters: + name: InstallRcclTests + displayName: Install RCCL Tests + remoteScript: | + cd + git clone --filter=blob:none --no-checkout https://github.com/ROCm/rocm-systems.git + cd rocm-systems + git sparse-checkout init --cone + git sparse-checkout set projects/rccl-tests + git checkout + cd projects/rccl-tests + MPI=1 MPI_HOME=/usr/local/mpi make -j + +- template: run-remote-task.yml + parameters: + name: RunRcclAllGatherTest + displayName: Run RCCL AllGather Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: run-remote-task.yml + parameters: + name: RunRcclAllReduceTest + displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib + remoteScript: | + mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20 + +- template: stop.yml + parameters: + 
subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml new file mode 100644 index 00000000..37b3a7d7 --- /dev/null +++ b/.azure-pipelines/templates/run-remote-task.yml @@ -0,0 +1,27 @@ +parameters: +- name: name + type: string + default: '' +- name: displayName + type: string +- name: runRemoteArgs + type: string + default: '' +- name: remoteScript + type: string +- name: workingDirectory + type: string + default: '$(System.DefaultWorkingDirectory)' + +steps: +- task: Bash@3 + ${{ if ne(parameters.name, '') }}: + name: ${{ parameters.name }} + displayName: ${{ parameters.displayName }} + inputs: + targetType: 'inline' + script: | + test/deploy/run-remote.sh ${{ parameters.runRemoteArgs }} <<'REMOTE_CMD' + ${{ parameters.remoteScript }} + REMOTE_CMD + workingDirectory: ${{ parameters.workingDirectory }} diff --git a/.azure-pipelines/templates/stop.yml b/.azure-pipelines/templates/stop.yml new file mode 100644 index 00000000..40498c29 --- /dev/null +++ b/.azure-pipelines/templates/stop.yml @@ -0,0 +1,20 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: resourceGroup + type: string + default: mscclpp + +steps: +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml deleted file mode 100644 index 0d97f9fc..00000000 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ /dev/null @@ -1,191 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- 
name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: StopContainer - displayName: Stop existing container - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true" - rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: BuildWithIb - displayName: Rebuild with IB - inputs: - targetType: 'inline' - script: | - rm -rf build && mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: DeployTestEnvWithIb - displayName: Deploy Test Env (with IB build) - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: single-node-test false - workingDirectory: $(System.DefaultWorkingDirectory) - -- task: Bash@3 - name: PyTestsWithIbBuildDisableIb - displayName: Run pytests (IB build, IB tests disabled) - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/.azure-pipelines/templates/ut-no-ib-env.yml b/.azure-pipelines/templates/ut-no-ib-env.yml new file mode 100644 index 00000000..a62f1a77 --- /dev/null +++ b/.azure-pipelines/templates/ut-no-ib-env.yml @@ -0,0 +1,95 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ 
parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_USE_IB=OFF' + deployArgs: 'single-node-test false' + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: run-remote-task.yml + parameters: + name: StopContainer + displayName: Stop existing container + runRemoteArgs: '--no-docker --no-log' + remoteScript: | + sudo docker stop mscclpp-test || true + sudo docker rm mscclpp-test || true + +- task: Bash@3 + displayName: Remove generated SSH key files + inputs: + targetType: 'inline' + script: | + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + set -e + rm -rf build + mkdir -p build && cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMSCCLPP_BYPASS_GPU_CHECK=ON \ + -DMSCCLPP_USE_CUDA=ON \ + -DMSCCLPP_BUILD_TESTS=ON \ + -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- template: run-remote-task.yml + parameters: + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + remoteScript: | + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut-npkit.yaml b/.azure-pipelines/templates/ut-npkit.yaml deleted file mode 100644 index 5c35317e..00000000 --- a/.azure-pipelines/templates/ut-npkit.yaml +++ /dev/null @@ -1,145 +0,0 @@ -parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: gpuArch - type: string - - -steps: -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: 
'$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - set -e; \ - cd /root/mscclpp; \ - mkdir -p build && cd build; \ - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ - make -j"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter=\"ExecutorTest.TwoNodesAllreduce\"; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: 'inline' - script: | - # set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json; \ - rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json'; \ - python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output; \ - grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json; \ - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: 
always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml new file mode 100644 index 00000000..e53b5cf5 --- /dev/null +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -0,0 +1,57 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: gpuArch + type: string + + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"' + deployArgs: 'single-node-test' + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce" + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + +- template: run-remote-task.yml + 
parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + export MSCCLPP_NPKIT_DUMP_DIR=./npkit_dump + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' + python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output + grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml deleted file mode 100644 index 2086fd0a..00000000 --- a/.azure-pipelines/templates/ut.yaml +++ /dev/null @@ -1,142 +0,0 @@ 
-parameters: -- name: subscription - type: string -- name: vmssName - type: string -- name: sshKeySecureFile - type: string -- name: platform - type: string - default: 'cuda' -- name: gpuArch - type: string - -steps: -- task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - if [ "${{ parameters.platform }}" == "rocm" ]; then - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - else - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. - fi - make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} - -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: 'inline' - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp - -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test true ${{ parameters.platform }}" - workingDirectory: '$(System.DefaultWorkingDirectory)' - - -- task: Bash@3 - name: UnitTests - displayName: Run mscclpp unit tests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - 
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - ./build/bin/unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: MpUnitTests - displayName: Run mscclpp multi-process unit tests - inputs: - targetType: 'inline' - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - cd /root/mscclpp; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: Bash@3 - name: PyTests - displayName: Run pytests - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml new file mode 100644 index 00000000..9d17e923 --- /dev/null +++ b/.azure-pipelines/templates/ut.yml @@ -0,0 +1,48 @@ +parameters: +- name: subscription + type: string +- name: vmssName + type: string +- name: platform + type: string + default: 'cuda' +- name: gpuArch + type: string + +steps: +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + platform: ${{ parameters.platform }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test true ${{ parameters.platform }}' + + +- template: run-remote-task.yml + parameters: + name: UnitTests + displayName: Run mscclpp unit tests + remoteScript: | + ./build/bin/unit_tests + +- template: run-remote-task.yml + parameters: + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + remoteScript: | + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests + +- template: run-remote-task.yml + parameters: + name: PyTests + displayName: Run pytests + remoteScript: | + mpirun 
--allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/ut-rocm.yml deleted file mode 100644 index 8b0aed1a..00000000 --- a/.azure-pipelines/ut-rocm.yml +++ /dev/null @@ -1,50 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - apps/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - apps/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: UnitTestMI300X - timeoutInMinutes: 40 - pool: - name: msccl-ci-mi300x - strategy: - matrix: - rocm6_2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - container: - image: $(containerImage) - - steps: - - template: templates/ut.yaml - parameters: - subscription: mscclpp-ci-mi300x - vmssName: mscclpp-mi300x-ci - sshKeySecureFile: mscclpp.pem - platform: rocm - gpuArch: gfx942 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 4aac07e6..4e6f96b1 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -37,17 +37,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestWithNpKitA100 @@ -59,17 +58,16 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: 
ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci vmssName: mscclpp-ci - sshKeySecureFile: mscclpp.pem gpuArch: '80' - job: UnitTestH100 @@ -79,17 +77,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut.yaml + - template: templates/ut.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestWithNpKitH100 @@ -99,17 +96,16 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-npkit.yaml + - template: templates/ut-npkit.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' - job: UnitTestNoIBEnv @@ -121,15 +117,34 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) steps: - - template: templates/ut-no-ib-env.yaml + - template: templates/ut-no-ib-env.yml parameters: subscription: mscclpp-ci-h100 vmssName: mscclpp-h100-ci - sshKeySecureFile: mscclpp.pem gpuArch: '90' + +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yml + parameters: + subscription: mscclpp-ci-mi300x 
+ vmssName: mscclpp-mi300x-ci + platform: rocm + gpuArch: gfx942 diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 00000000..a98f1e89 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,24 @@ +codecov: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + target: 68% + threshold: 1% + patch: + default: + target: 80% + +flag_management: + default_rules: + carryforward: true + +ignore: + - "test/" + - "examples/" + - "python/" + - "tools/" + - "docs/" + - "docker/" diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index db3b488a..fb065141 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - name: Checkout repository @@ -62,7 +62,7 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON .. + cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=OFF .. make -j4 - name: Perform CodeQL Analysis @@ -107,7 +107,7 @@ jobs: - name: Build run: | rm -rf build && mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. + CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=OFF .. 
make -j4 - name: Perform CodeQL Analysis diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yml similarity index 100% rename from .github/workflows/doc-build.yaml rename to .github/workflows/doc-build.yml diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml deleted file mode 100644 index 900e8aba..00000000 --- a/.github/workflows/integration-test-backup.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: IntegrationTest - -on: workflow_dispatch - -jobs: - IntegrationTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - - name: Lock GPU clock frequency - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Run mscclpp AllGather test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - - - name: Run mscclpp SendRecv test - run: | - set -e - mpirun 
--allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl - - - name: Run mscclpp AllReduce test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl - - - name: Run mscclpp AllToAll test - run: | - set -e - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - - - name: Check collective primitives performance - run: | - set -e - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 5947b087..a9187e96 100644 --- 
a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml deleted file mode 100644 index 8849c353..00000000 --- a/.github/workflows/ut-backup.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: UnitTest - -on: workflow_dispatch - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - - container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - working-directory: ${{ github.workspace }} - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: UnitTests - run: | - ./build/bin/unit_tests - - - name: MpUnitTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests - mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests - - - name: PyTests - run: | - set -e - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x diff --git a/.gitignore b/.gitignore index ed3b94c4..74307e67 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .vscode/ build/ +build_coverage/ __pycache__ .*.swp *.so diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ff7b075..9db54d15 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. cmake_minimum_required(VERSION 3.25) project(mscclpp LANGUAGES CXX) @@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF) +option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF) option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON) set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).") @@ -99,6 +100,62 @@ else() message(FATAL_ERROR "No compatible GPU found. Set MSCCLPP_USE_CUDA or MSCCLPP_USE_ROCM to ON.") endif() endif() + +# Code coverage setup +if(MSCCLPP_ENABLE_COVERAGE) + if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + message(WARNING "Code coverage results with an optimized (non-Debug) build may be misleading") + endif() + + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + message(STATUS "Code coverage enabled") + + # Add coverage flags to C++ targets only (not CUDA) + add_compile_options($<$:--coverage>) + add_compile_options($<$:-O0>) + add_compile_options($<$:-g>) + add_link_options($<$:--coverage>) + + # Find lcov + find_program(LCOV_PATH lcov) + + if(NOT LCOV_PATH) + message(WARNING "lcov not found. Install lcov to generate coverage reports.") + endif() + + if(LCOV_PATH) + # Add coverage target + add_custom_target(coverage + COMMAND ${CMAKE_COMMAND} -E echo "Removing old coverage data..." + COMMAND ${LCOV_PATH} --directory . --zerocounters + + COMMAND ${CMAKE_COMMAND} -E echo "Running tests..." + COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure + + COMMAND ${CMAKE_COMMAND} -E echo "Collecting coverage data..." + COMMAND ${LCOV_PATH} --directory . --capture --output-file coverage.info + + COMMAND ${CMAKE_COMMAND} -E echo "Filtering coverage data..." 
+ COMMAND ${LCOV_PATH} --remove coverage.info '/usr/*' '*/test/*' '*/build/*' --output-file coverage.info + + COMMAND ${CMAKE_COMMAND} -E echo "Coverage report generated in coverage.info" + + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Generating code coverage report" + ) + + # Add coverage clean target + add_custom_target(coverage-clean + COMMAND ${CMAKE_COMMAND} -E remove coverage.info + COMMAND ${LCOV_PATH} --directory . --zerocounters + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Cleaning coverage data" + ) + endif() + else() + message(WARNING "Code coverage is only supported with GCC or Clang compilers") + endif() +endif() if(MSCCLPP_GPU_ARCHS) string(STRIP "${MSCCLPP_GPU_ARCHS}" MSCCLPP_GPU_ARCHS) string(REPLACE " " ";" MSCCLPP_GPU_ARCHS "${MSCCLPP_GPU_ARCHS}") diff --git a/README.md b/README.md index 5366f5b5..58586a30 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,16 @@ [![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest) [![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE) [![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml) -[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yaml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![Docs Build](https://github.com/microsoft/mscclpp/actions/workflows/doc-build.yml/badge.svg)](https://microsoft.github.io/mscclpp/) +[![codecov](https://codecov.io/gh/microsoft/mscclpp/graph/badge.svg?token=DAV9DGHAY2)](https://codecov.io/gh/microsoft/mscclpp) | Testing Pipelines | Build Status | |--------------------------|-------------------| -| Unit Tests (CUDA) | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | -| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) | -| NCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) | -| RCCL Tests | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) | +| Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestH100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main&jobName=UnitTestMI300X)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | +| Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main&jobName=Integration%20test%20H100)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | +| NCCL Tests | [![Build 
Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-nccl?repoName=microsoft%2Fmscclpp&branchName=main&jobName=Run%20MSCCLPP%20over%20NCCL%20Test%20(H100))](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) | +| RCCL Tests | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-rccl?branchName=main&jobName=Run%20MSCCLPP%20over%20RCCL%20Test%20(MI300X))](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) | A GPU-driven communication stack for scalable AI applications. diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 3aa81422..7c6c927e 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp RUN apt-get update && \ apt-get install -y --no-install-recommends \ htop \ - lcov \ vim \ && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* +# Install lcov 2.2 +RUN LCOV_VERSION="2.2" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + cpanminus \ + gcc \ + make \ + perl \ + && \ + cpanm --notest \ + Capture::Tiny \ + DateTime \ + JSON::XS \ + Memory::Process \ + TimeDate \ + && \ + cd /tmp && \ + curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \ + tar xzf lcov.tar.gz && \ + cd lcov-${LCOV_VERSION} && \ + make install && \ + cd / && rm -rf /tmp/lcov* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + # Install CMake 3.26.4 RUN OS_ARCH=$(uname -m) && \ CMAKE_VERSION="3.26.4" && \ @@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ fi && \ pip install --no-cache-dir --upgrade pip && 
\ - pip install --no-cache-dir -r python/requirements_${target_type}.txt + pip install --no-cache-dir -r python/requirements_${target_type}.txt && \ + pip install --no-cache-dir coverage xlsxwriter # Cleanup RUN rm -rf /tmp/mscclpp diff --git a/docker/build.sh b/docker/build.sh index 63552f74..89568e19 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,27 +4,27 @@ set -e declare -A baseImageTable baseImageTable=( - ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" - ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" - ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" - ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" - ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" + ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable extraLdPathTable=( - ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" - ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" - ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" + ["cuda11.8"]="/usr/local/cuda-11.8/compat" + ["cuda12.4"]="/usr/local/cuda-12.4/compat" + ["cuda12.8"]="/usr/local/cuda-12.8/compat" + ["cuda12.9"]="/usr/local/cuda-12.9/compat" + ["cuda13.0"]="/usr/local/cuda-13.0/compat" ["rocm6.2"]="/opt/rocm/lib" ) declare -A ofedVersionTable ofedVersionTable=( + ["cuda11.8"]="23.07-0.5.1.2" ["cuda12.4"]="23.07-0.5.1.2" ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" @@ -36,7 +36,7 @@ TARGET=${1} OS_ARCH=$(uname -m) print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" } if [[ ! 
-v "baseImageTable[${TARGET}]" ]]; then diff --git a/docs/quickstart.md b/docs/quickstart.md index ac1b7d6b..b7a68050 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se ```bash # For NVIDIA platforms -$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash +$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash # For AMD platforms $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash ``` @@ -171,7 +171,6 @@ We implement [NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/ap For example, you can run [nccl-tests](https://github.com/NVIDIA/nccl-tests) using `libmscclpp_nccl.so` as follows, where `MSCCLPP_BUILD` is your MSCCL++ build directory. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` @@ -189,13 +188,11 @@ By default, if the parameter `MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION` is not spec Example 1, Allreduce will fallback to NCCL ncclAllReduce since allreduce is in the fallback list. 
```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce,allgather" ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist. ```bash -export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6452ebf8..82b799dc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,13 +1,12 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. 
-find_package(MPI) +find_package(MPI REQUIRED) set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads) if(MSCCLPP_USE_IB) list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES}) endif() -set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main) set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include) @@ -17,6 +16,7 @@ if(MSCCLPP_USE_ROCM) foreach(arch ${MSCCLPP_GPU_ARCHS}) add_compile_options(--offload-arch=${arch}) endforeach() + add_compile_definitions(__HIP_PLATFORM_AMD__) endif() function(add_test_executable name sources) @@ -38,28 +38,25 @@ add_test_executable(executor_test executor_test.cc) configure_file(run_mpi_test.sh.in run_mpi_test.sh) include(CTest) -include(FetchContent) -FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip) -option(INSTALL_GTEST OFF) -FetchContent_MakeAvailable(googletest) -include(GoogleTest) + +# Build test framework library +add_library(test_framework STATIC framework.cc) +target_include_directories(test_framework PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${TEST_INC_COMMON}) +target_link_libraries(test_framework PUBLIC MPI::MPI_CXX) # Unit tests add_executable(unit_tests) -target_link_libraries(unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST}) +target_link_libraries(unit_tests ${TEST_LIBS_COMMON} test_framework) target_include_directories(unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL}) add_subdirectory(unit) -gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST) +add_test(NAME unit_tests COMMAND unit_tests) # Multi-process unit tests add_executable(mp_unit_tests) -target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} ${TEST_LIBS_GTEST} MPI::MPI_CXX) +target_link_libraries(mp_unit_tests ${TEST_LIBS_COMMON} test_framework MPI::MPI_CXX) target_include_directories(mp_unit_tests ${TEST_INC_COMMON} ${TEST_INC_INTERNAL}) add_subdirectory(mp_unit) 
-gtest_discover_tests(mp_unit_tests DISCOVERY_MODE PRE_TEST) +add_test(NAME mp_unit_tests COMMAND ${CMAKE_CURRENT_BINARY_DIR}/run_mpi_test.sh mp_unit_tests 2) # mscclpp-test add_subdirectory(mscclpp-test) - -# Performance tests -add_subdirectory(perf) diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index b26ff1a8..1f1d0e52 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,4 +1,4 @@ -set -e +set -ex TEST_NAME=$1 IB_ENVIRONMENT="${2:-true}" diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh new file mode 100755 index 00000000..b646ea92 --- /dev/null +++ b/test/deploy/run-remote.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Run a command on remote CI VMs via parallel-ssh. +# By default, runs inside the mscclpp-test docker container. +# +# Usage: +# run-remote.sh [OPTIONS] < +# +# Options: +# --no-docker Run command directly on the host, not inside docker +# --no-log Don't tail the log file in the background +# --hostfile Override hostfile path (default: test/deploy/hostfile_ci) +# --host Run command on a single host (uses parallel-ssh -H) +# --user SSH user when using --host or custom hostfile + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HOSTFILE="${SCRIPT_DIR}/hostfile_ci" +SSH_OPTION="StrictHostKeyChecking=no" +KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}" + +USE_DOCKER=true +USE_LOG=true +TARGET_HOST="" +REMOTE_USER="" + +usage() { + echo "Usage: $0 [--no-docker] [--no-log] [--hostfile ] [--host ] [--user ] < " >&2 +} + +require_value() { + local opt="$1" + local val="$2" + if [ -z "$val" ]; then + echo "Missing value for ${opt}" >&2 + exit 1 + fi +} + +while [[ "$1" == --* ]]; do + case "$1" in + --no-docker) USE_DOCKER=false; shift ;; + --no-log) USE_LOG=false; shift ;; + --hostfile) + require_value "--hostfile" "${2-}" + HOSTFILE="$2" + shift 2 + ;; + --host) + require_value "--host" "${2-}" + TARGET_HOST="$2" + shift 2 + ;; + --user) + require_value "--user" "${2-}" + REMOTE_USER="$2" + shift 2 + ;; 
+ *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [ $# -ne 0 ] || [ -t 0 ]; then + usage + exit 1 +fi + +CMD=$(cat) +if [ -z "$CMD" ]; then + usage + exit 1 +fi +CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n') + +PSSH_TARGET_ARGS=() +if [ -n "$TARGET_HOST" ]; then + PSSH_TARGET_ARGS=(-H "$TARGET_HOST") +else + PSSH_TARGET_ARGS=(-h "$HOSTFILE") +fi + +PSSH_USER_ARGS=() +if [ -n "$REMOTE_USER" ]; then + PSSH_USER_ARGS=(-l "$REMOTE_USER") +fi + +PSSH_COMMON=( + -t 0 + "${PSSH_TARGET_ARGS[@]}" + "${PSSH_USER_ARGS[@]}" + -x "-i ${KeyFilePath}" + -O "$SSH_OPTION" +) + +if $USE_DOCKER; then + INNER="set -euxo pipefail;" + INNER+=" cd /root/mscclpp;" + INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;" + INNER+=" CMD_B64='${CMD_B64}';" + INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail" + + parallel-ssh -i "${PSSH_COMMON[@]}" \ + "sudo docker exec mscclpp-test bash -c \"${INNER}\"" +else + parallel-ssh -i "${PSSH_COMMON[@]}" \ + "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail" +fi diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index 488fa81f..0c05a090 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -1,6 +1,5 @@ set -e HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi -export PATH=/usr/local/mpi/bin:$PATH function run_mscclpp_test() { diff --git a/test/executor_test.cc b/test/executor_test.cc index 0e7869ab..2378e7ff 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -93,11 +93,8 @@ double benchTime(int rank, std::shared_ptr bootstrap, std::s int main(int argc, char* argv[]) { if (argc != 5 && argc != 6) { - std::cerr << "Usage: " << argv[0] << " " - << " " - << " " - << " " - << " (optional) " << std::endl; + std::cerr << "Usage: " << argv[0] << " " + << " (optional) " << std::endl; return 1; } @@ -142,7 +139,8 @@ int main(int argc, char* argv[]) { NpKit::Shutdown(); } - std::cout << "Rank " << 
rank << ": " << bufferSize << " bytes " << deltaSec * 1.e6 << " us" << std::endl; + double latencyUs = deltaSec * 1.e6; + std::cout << "Rank " << rank << ": " << bufferSize << " bytes " << latencyUs << " us" << std::endl; MPI_Finalize(); return 0; } diff --git a/test/framework.cc b/test/framework.cc new file mode 100644 index 00000000..73cf1272 --- /dev/null +++ b/test/framework.cc @@ -0,0 +1,323 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "framework.hpp" + +#include +#include +#include +#include +#include + +namespace mscclpp { +namespace test { + +// Global state +static int gMpiRank = 0; +static int gMpiSize = 1; +static bool gMpiInitialized = false; +static bool gCurrentTestPassed = true; +static std::string gCurrentTestFailureMessage; +static std::string gCurrentTestName; + +std::string currentTestName() { return gCurrentTestName; } + +namespace utils { + +void initializeMPI(int argc, char* argv[]) { + if (gMpiInitialized) return; + + int initialized = 0; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init(&argc, &argv); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &gMpiRank); + MPI_Comm_size(MPI_COMM_WORLD, &gMpiSize); + gMpiInitialized = true; +} + +static void finalizeMPI() { + if (!gMpiInitialized) return; + + MPI_Finalize(); + gMpiInitialized = false; +} + +bool isMainRank() { return gMpiRank == 0; } + +int getMPIRank() { return gMpiRank; } + +int getMPISize() { return gMpiSize; } + +void cleanupMPI() { finalizeMPI(); } + +void reportFailure(const char* file, int line, const std::string& message) { + gCurrentTestPassed = false; + std::ostringstream oss; + oss << file << ":" << line << ": " << message; + if (!gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage += "\n"; + } + gCurrentTestFailureMessage += oss.str(); + std::cerr << oss.str() << std::endl; +} + +void reportSuccess() { + gCurrentTestPassed = true; + gCurrentTestFailureMessage.clear(); +} + +// Timer implementation 
+Timer::Timer() : isRunning_(false) {} + +void Timer::start() { + startTime_ = std::chrono::high_resolution_clock::now(); + isRunning_ = true; +} + +void Timer::stop() { + endTime_ = std::chrono::high_resolution_clock::now(); + isRunning_ = false; +} + +double Timer::elapsedMicroseconds() const { + if (isRunning_) { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now - startTime_).count(); + } + return std::chrono::duration_cast(endTime_ - startTime_).count(); +} + +double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; } + +double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; } + +void cudaCheck(cudaError_t err, const char* file, int line) { + if (err != cudaSuccess) { + std::string msg = + std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err); + throw std::runtime_error(msg); + } +} + +} // namespace utils + +// TestRegistry implementation +TestRegistry& TestRegistry::instance() { + static TestRegistry registry; + return registry; +} + +void TestRegistry::registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory, + bool isPerfTest) { + tests_.push_back({suiteName, testName, std::move(factory), isPerfTest}); +} + +void TestRegistry::addEnvironment(Environment* env) { environments_.push_back(env); } + +// Returns true if the test should run given the filter string. 
+// Filter syntax: +// "" -> run all +// "Pattern" -> run only tests whose full name contains Pattern +// "-Pattern" -> run all tests EXCEPT those whose full name contains Pattern +static bool matchesFilter(const std::string& fullName, const std::string& filter) { + if (filter.empty()) return true; + if (filter[0] == '-') { + // Negative filter: exclude tests matching any comma-separated pattern + std::string patterns = filter.substr(1); + size_t pos = 0; + while (pos < patterns.size()) { + size_t comma = patterns.find(',', pos); + std::string pattern = (comma == std::string::npos) ? patterns.substr(pos) : patterns.substr(pos, comma - pos); + if (!pattern.empty() && fullName.find(pattern) != std::string::npos) { + return false; + } + pos = (comma == std::string::npos) ? patterns.size() : comma + 1; + } + return true; + } + // Positive filter: include only matching tests + return fullName.find(filter) != std::string::npos; +} + +int TestRegistry::runAllTests(int argc, char* argv[]) { + // Initialize MPI if not already initialized + if (!gMpiInitialized) { + utils::initializeMPI(argc, argv); + } + + // Parse command line arguments + std::string filter; + bool excludePerfTests = false; + + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg.find("--filter=") == 0) { + filter = arg.substr(9); // Length of "--filter=" + } else if (arg == "--filter" && i + 1 < argc) { + filter = argv[i + 1]; + ++i; + } else if (arg == "--exclude-perf-tests") { + excludePerfTests = true; + } + } + + // Set up global test environments + for (auto* env : environments_) { + try { + env->SetUp(); + } catch (const std::exception& e) { + if (gMpiRank == 0) { + std::cerr << "Failed to set up test environment: " << e.what() << std::endl; + } + return 1; + } + } + + int passed = 0; + int failed = 0; + int skipped = 0; + + // Count tests to run + int totalToRun = 0; + int skippedByFilter = 0; + for (const auto& entry : tests_) { + std::string fullName = entry.suiteName + "." 
+ entry.testName; + if (excludePerfTests && entry.isPerfTest) { + skippedByFilter++; + continue; + } + if (!matchesFilter(fullName, filter)) { + skippedByFilter++; + continue; + } + totalToRun++; + } + + if (gMpiRank == 0) { + std::cout << "[==========] Running " << totalToRun << " tests"; + if (skippedByFilter > 0) { + std::cout << " (" << skippedByFilter << " skipped by filter)"; + } + std::cout << ".\n"; + } + + for (const auto& entry : tests_) { + std::string fullName = entry.suiteName + "." + entry.testName; + + if (excludePerfTests && entry.isPerfTest) continue; + if (!matchesFilter(fullName, filter)) continue; + + gCurrentTestPassed = true; + gCurrentTestFailureMessage.clear(); + gCurrentTestName = fullName; + + if (gMpiRank == 0) { + std::cout << "[ RUN ] " << fullName << std::endl; + } + + TestCase* testCase = nullptr; + bool testSkipped = false; + bool setUpSucceeded = false; + try { + testCase = entry.factory(); + testCase->SetUp(); + setUpSucceeded = true; + testCase->TestBody(); + } catch (const SkipException& e) { + gCurrentTestPassed = true; + testSkipped = true; + if (gMpiRank == 0) { + std::cout << "[ SKIPPED ] " << fullName << ": " << e.what() << std::endl; + } + } catch (const std::exception& e) { + gCurrentTestPassed = false; + if (gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage = e.what(); + } + } catch (...) { + gCurrentTestPassed = false; + if (gCurrentTestFailureMessage.empty()) { + gCurrentTestFailureMessage = "Unknown exception"; + } + } + + // Always call TearDown() if SetUp() succeeded, even if TestBody() threw + if (setUpSucceeded && testCase != nullptr) { + try { + testCase->TearDown(); + } catch (const std::exception& e) { + // If test already failed, keep original failure message + if (gCurrentTestPassed) { + gCurrentTestPassed = false; + gCurrentTestFailureMessage = std::string("TearDown() failed: ") + e.what(); + } + } catch (...) 
{ + if (gCurrentTestPassed) { + gCurrentTestPassed = false; + gCurrentTestFailureMessage = "TearDown() failed with unknown exception"; + } + } + } + + delete testCase; + gCurrentTestName.clear(); + + if (testSkipped) { + skipped++; + continue; + } + + // Synchronize test status across all MPI processes + int localPassed = gCurrentTestPassed ? 1 : 0; + int globalPassed = 1; + if (gMpiInitialized) { + MPI_Allreduce(&localPassed, &globalPassed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + } else { + globalPassed = localPassed; + } + + if (gMpiRank == 0) { + if (globalPassed) { + std::cout << "[ OK ] " << fullName << std::endl; + passed++; + } else { + std::cout << "[ FAILED ] " << fullName << std::endl; + failed++; + } + } + } + + if (gMpiRank == 0) { + std::cout << "[==========] " << totalToRun << " tests ran.\n"; + if (passed > 0) { + std::cout << "[ PASSED ] " << passed << " tests.\n"; + } + if (skipped > 0) { + std::cout << "[ SKIPPED ] " << skipped << " tests.\n"; + } + if (failed > 0) { + std::cout << "[ FAILED ] " << failed << " tests.\n"; + } + } + + // Tear down global test environments (in reverse order) + for (auto it = environments_.rbegin(); it != environments_.rend(); ++it) { + try { + (*it)->TearDown(); + delete *it; + } catch (const std::exception& e) { + if (gMpiRank == 0) { + std::cerr << "Failed to tear down test environment: " << e.what() << std::endl; + } + } + } + environments_.clear(); + + return failed > 0 ? 1 : 0; +} + +} // namespace test +} // namespace mscclpp diff --git a/test/framework.hpp b/test/framework.hpp new file mode 100644 index 00000000..26a32d5b --- /dev/null +++ b/test/framework.hpp @@ -0,0 +1,405 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#ifndef MSCCLPP_TEST_FRAMEWORK_HPP_ +#define MSCCLPP_TEST_FRAMEWORK_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mscclpp { +namespace test { + +// Test case base class +class TestCase { + public: + virtual ~TestCase() = default; + virtual void SetUp() {} + virtual void TearDown() {} + virtual void TestBody() = 0; +}; + +// Environment base class (for global test setup/teardown) +class Environment { + public: + virtual ~Environment() = default; + virtual void SetUp() {} + virtual void TearDown() {} +}; + +// Test registry and runner +class TestRegistry { + public: + using TestFactory = std::function; + + static TestRegistry& instance(); + + void registerTest(const std::string& suiteName, const std::string& testName, TestFactory factory, + bool isPerfTest = false); + void addEnvironment(Environment* env); + int runAllTests(int argc, char* argv[]); + + private: + TestRegistry() = default; + struct TestEntry { + std::string suiteName; + std::string testName; + TestFactory factory; + bool isPerfTest; + }; + std::vector tests_; + std::vector environments_; +}; + +// Returns "Suite.Name" for the currently running test, or "" if none. 
+std::string currentTestName(); + +// Utility functions +namespace utils { + +// MPI management +void initializeMPI(int argc, char* argv[]); +void cleanupMPI(); +bool isMainRank(); +int getMPIRank(); +int getMPISize(); + +// Timing utilities +class Timer { + public: + Timer(); + void start(); + void stop(); + double elapsedMicroseconds() const; + double elapsedMilliseconds() const; + double elapsedSeconds() const; + + private: + std::chrono::high_resolution_clock::time_point startTime_; + std::chrono::high_resolution_clock::time_point endTime_; + bool isRunning_; +}; + +// CUDA utilities +void cudaCheck(cudaError_t err, const char* file, int line); +#define CUDA_CHECK(call) mscclpp::test::utils::cudaCheck(call, __FILE__, __LINE__) + +// Test assertion helpers +void reportFailure(const char* file, int line, const std::string& message); +void reportSuccess(); + +} // namespace utils + +// Exception for test skips +class SkipException : public std::runtime_error { + public: + explicit SkipException(const std::string& message) : std::runtime_error(message) {} +}; + +// Helper class for FAIL() macro — supports message streaming via operator<< +class FailHelper { + public: + explicit FailHelper(const char* file, int line) : file_(file), line_(line) {} + template + FailHelper& operator<<(const T& value) { + message_ << value; + return *this; + } + ~FailHelper() noexcept(false) { + std::string msg = message_.str(); + if (!msg.empty()) { + ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed: " + msg); + } else { + ::mscclpp::test::utils::reportFailure(file_, line_, "Test failed"); + } + throw std::runtime_error("Test failed"); + } + + private: + const char* file_; + int line_; + std::ostringstream message_; +}; + +// Helper class for SKIP_TEST() macro — supports message streaming via operator<< +// Usage: SKIP_TEST() << "Reason for skipping"; +class SkipHelper { + public: + explicit SkipHelper(const char* file, int line) : file_(file), line_(line) {} + template 
+ SkipHelper& operator<<(const T& value) { + message_ << value; + return *this; + } + ~SkipHelper() noexcept(false) { + std::string msg = message_.str(); + if (!msg.empty()) { + throw SkipException("Test skipped: " + msg); + } else { + throw SkipException("Test skipped"); + } + } + + private: + const char* file_; + int line_; + std::ostringstream message_; +}; + +// SFINAE helper: resolves to T if T is a complete type (user-defined fixture), +// otherwise falls back to TestCase. This lets TEST() work with or without a fixture class. +namespace detail { +template +using void_t = void; + +template > +struct FixtureOf { + using type = TestCase; +}; +template +struct FixtureOf> { + using type = T; +}; +} // namespace detail + +} // namespace test +} // namespace mscclpp + +// --- Test registration macros --- +// TEST(Suite, Name): if Suite is a previously-defined class, the test inherits from it (fixture). +// Otherwise, the test inherits from TestCase (no fixture needed). + +#define TEST(test_fixture, test_name) \ + class test_fixture; \ + class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf::type { \ + public: \ + void TestBody() override; \ + }; \ + static bool test_fixture##_##test_name##_registered = []() { \ + ::mscclpp::test::TestRegistry::instance().registerTest( \ + #test_fixture, #test_name, \ + []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }); \ + return true; \ + }(); \ + void test_fixture##_##test_name##_Test::TestBody() + +#define PERF_TEST(test_fixture, test_name) \ + class test_fixture; \ + class test_fixture##_##test_name##_Test : public ::mscclpp::test::detail::FixtureOf::type { \ + public: \ + void TestBody() override; \ + }; \ + static bool test_fixture##_##test_name##_registered = []() { \ + ::mscclpp::test::TestRegistry::instance().registerTest( \ + #test_fixture, #test_name, \ + []() -> ::mscclpp::test::TestCase* { return new test_fixture##_##test_name##_Test(); }, true); \ + 
return true; \ + }(); \ + void test_fixture##_##test_name##_Test::TestBody() + +// --- Test runner macro --- +#define RUN_ALL_TESTS() ::mscclpp::test::TestRegistry::instance().runAllTests(argc, argv) + +// Assertion macros +#define EXPECT_TRUE(condition) \ + do { \ + if (!(condition)) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \ + } \ + } while (0) + +#define EXPECT_FALSE(condition) \ + do { \ + if (condition) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \ + } \ + } while (0) + +#define EXPECT_EQ(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " == " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_NE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 != v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " != " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_LT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 < v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " < " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_LE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 <= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " <= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_GT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 > v2)) { \ + std::ostringstream oss; 
\ + oss << "Expected: " #val1 " > " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define EXPECT_GE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 >= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " >= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + } \ + } while (0) + +#define ASSERT_TRUE(condition) \ + do { \ + if (!(condition)) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be true"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_FALSE(condition) \ + do { \ + if (condition) { \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, "Expected: " #condition " to be false"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_EQ(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " == " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_NE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 != v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " != " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_LT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 < v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " < " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, 
__LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_LE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 <= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " <= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_GT(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 > v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " > " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_GE(val1, val2) \ + do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 >= v2)) { \ + std::ostringstream oss; \ + oss << "Expected: " #val1 " >= " #val2 << "\n Actual: " << v1 << " vs " << v2; \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +#define ASSERT_NO_THROW(statement) \ + do { \ + try { \ + statement; \ + } catch (const std::exception& e) { \ + std::ostringstream oss; \ + oss << "Expected: " #statement " not to throw\n Actual: threw " << e.what(); \ + ::mscclpp::test::utils::reportFailure(__FILE__, __LINE__, oss.str()); \ + throw std::runtime_error("Test assertion failed"); \ + } catch (...) { \ + ::mscclpp::test::utils::reportFailure( \ + __FILE__, __LINE__, "Expected: " #statement " not to throw\n Actual: threw unknown exception"); \ + throw std::runtime_error("Test assertion failed"); \ + } \ + } while (0) + +// --- Test control macros --- + +// Fail the current test immediately. Usage: FAIL() << "reason"; +#define FAIL() ::mscclpp::test::FailHelper(__FILE__, __LINE__) + +// Skip the current test. 
Usage: SKIP_TEST() << "reason"; +#define SKIP_TEST() ::mscclpp::test::SkipHelper(__FILE__, __LINE__) + +#endif // MSCCLPP_TEST_FRAMEWORK_HPP_ diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index 4bbab2f1..c28087a4 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -48,7 +48,7 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr bootstr bootstrapTestSendRecv(bootstrap); } -TEST_F(BootstrapTest, WithId) { +TEST(BootstrapTest, WithId) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); @@ -57,13 +57,13 @@ TEST_F(BootstrapTest, WithId) { bootstrapTestAll(bootstrap); } -TEST_F(BootstrapTest, WithIpPortPair) { +TEST(BootstrapTest, WithIpPortPair) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->initialize(gEnv->args["ip_port"]); bootstrapTestAll(bootstrap); } -TEST_F(BootstrapTest, ResumeWithId) { +TEST(BootstrapTest, ResumeWithId) { // This test may take a few minutes. 
bootstrapTestTimer.set(300); @@ -76,19 +76,19 @@ TEST_F(BootstrapTest, ResumeWithId) { } } -TEST_F(BootstrapTest, ResumeWithIpPortPair) { +TEST(BootstrapTest, ResumeWithIpPortPair) { for (int i = 0; i < 5; ++i) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->initialize(gEnv->args["ip_port"]); } } -TEST_F(BootstrapTest, ExitBeforeConnect) { +TEST(BootstrapTest, ExitBeforeConnect) { auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->createUniqueId(); } -TEST_F(BootstrapTest, TimeoutWithId) { +TEST(BootstrapTest, TimeoutWithId) { mscclpp::Timer timer; // All ranks initialize a bootstrap with their own id (will hang) @@ -99,7 +99,7 @@ TEST_F(BootstrapTest, TimeoutWithId) { // Set bootstrap timeout to 1 second bootstrap->initialize(id, 1); } catch (const mscclpp::Error& e) { - ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout); + ASSERT_TRUE(e.getErrorCode() == mscclpp::ErrorCode::Timeout); } // Timeout should be sligtly greater than 1 second @@ -139,7 +139,7 @@ class MPIBootstrap : public mscclpp::Bootstrap { } }; -TEST_F(BootstrapTest, MPIBootstrap) { +TEST(BootstrapTest, MPIBootstrap) { auto bootstrap = std::make_shared(); bootstrapTestAll(bootstrap); } diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu index 9d83532a..066c5514 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include @@ -185,7 +185,7 @@ bool CommunicatorTest::testWriteCorrectness(bool skipLocal) { return true; } -TEST_F(CommunicatorTest, BasicWrite) { +TEST(CommunicatorTest, BasicWrite) { if (gEnv->rank >= numRanksToUse) return; deviceBufferInit(); @@ -215,7 +215,7 @@ __global__ void kernelWaitSemaphores(mscclpp::Host2DeviceSemaphore::DeviceHandle } } -TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) { +TEST(CommunicatorTest, WriteWithDeviceSemaphores) { if (gEnv->rank >= numRanksToUse) return; std::unordered_map> semaphores; @@ -254,7 +254,7 @@ TEST_F(CommunicatorTest, WriteWithDeviceSemaphores) { communicator->bootstrap()->barrier(); } -TEST_F(CommunicatorTest, WriteWithHostSemaphores) { +TEST(CommunicatorTest, WriteWithHostSemaphores) { if (gEnv->rank >= numRanksToUse) return; std::unordered_map> semaphores; diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc index a903ed08..4f3f2545 100644 --- a/test/mp_unit/executor_tests.cc +++ b/test/mp_unit/executor_tests.cc @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include +#include #include #include @@ -22,7 +23,7 @@ std::string getExecutablePath() { void ExecutorTest::SetUp() { if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) { - GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2"; + SKIP_TEST() << "This test requires world size to be 2 and ranks per node to be 2"; } MultiProcessTest::SetUp(); @@ -49,7 +50,7 @@ void ExecutorTest::TearDown() { MultiProcessTest::TearDown(); } -TEST_F(ExecutorTest, TwoNodesAllreduce) { +TEST(ExecutorTest, TwoNodesAllreduce) { std::string executablePath = getExecutablePath(); std::filesystem::path path = executablePath; std::filesystem::path executionFilesPath = diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 051030ac..04ab402d 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -18,9 +18,7 @@ void IbTestBase::SetUp() { } void IbPeerToPeerTest::SetUp() { -#if !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) + REQUIRE_IBVERBS; IbTestBase::SetUp(); @@ -80,7 +78,7 @@ void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint6 qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); } -TEST_F(IbPeerToPeerTest, SimpleSendRecv) { +TEST(IbPeerToPeerTest, SimpleSendRecv) { if (gEnv->rank >= 2) { // This test needs only two ranks return; @@ -195,7 +193,7 @@ __global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIt } } -TEST_F(IbPeerToPeerTest, MemoryConsistency) { +TEST(IbPeerToPeerTest, MemoryConsistency) { if (gEnv->rank >= 2) { // This test needs only two ranks return; @@ -303,7 +301,7 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { EXPECT_EQ(res, 0); } -TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) { 
+TEST(IbPeerToPeerTest, SimpleAtomicAdd) { if (gEnv->rank >= 2) { // This test needs only two ranks return; diff --git a/test/mp_unit/memory_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu index f6ef3aed..318d301a 100644 --- a/test/mp_unit/memory_channel_tests.cu +++ b/test/mp_unit/memory_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include @@ -8,7 +8,7 @@ void MemoryChannelOneToOneTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { - GTEST_SKIP(); + SKIP_TEST(); } // Use only two ranks setNumRanksToUse(2); @@ -88,27 +88,12 @@ void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName, std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); // The least nelem is 2 for packet ping pong - kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; + for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelWrapper(buff.get(), gEnv->rank, nElem, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } int nTries = 1000000; communicator->bootstrap()->barrier(); @@ -169,7 +154,7 @@ __global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(MemoryChannelOneToOneTest, PutPingPong) { +TEST(MemoryChannelOneToOneTest, PutPingPong) { if (gEnv->rank >= numRanksToUse) 
return; const int nElem = 4 * 1024 * 1024; @@ -187,28 +172,12 @@ TEST_F(MemoryChannelOneToOneTest, PutPingPong) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } } __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { @@ -248,7 +217,7 @@ __global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(MemoryChannelOneToOneTest, GetPingPong) { +TEST(MemoryChannelOneToOneTest, GetPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -266,28 +235,12 @@ TEST_F(MemoryChannelOneToOneTest, GetPingPong) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - 
- EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } } __global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { @@ -371,14 +324,14 @@ __global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int* } } -TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) { +TEST(MemoryChannelOneToOneTest, LL8PacketPingPong) { auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper); } -TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) { +TEST(MemoryChannelOneToOneTest, LL16PacketPingPong) { auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc index cafd9bbc..2f6dc1ca 100644 --- a/test/mp_unit/mp_unit_tests.cc +++ b/test/mp_unit/mp_unit_tests.cc @@ -98,14 +98,18 @@ static std::unordered_map parseArgs(int argc, const ch continue; } - // Unrecognized positional token: ignore to keep parser permissive for gtest/MPI extras + // Unrecognized positional token: ignore } return options; } void MultiProcessTestEnv::SetUp() { - MPI_Init(NULL, NULL); + int initialized = 0; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init(NULL, NULL); + } MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &worldSize); // get the local number of nodes with MPI @@ 
-128,18 +132,17 @@ void MultiProcessTest::TearDown() { } int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); gEnv = new MultiProcessTestEnv(argc, (const char**)argv); - ::testing::AddGlobalTestEnvironment(gEnv); + ::mscclpp::test::TestRegistry::instance().addEnvironment(gEnv); return RUN_ALL_TESTS(); } -TEST_F(MultiProcessTest, Prelim) { +TEST(MultiProcessTest, Prelim) { // Test to make sure the MPI environment is set up correctly ASSERT_GE(gEnv->worldSize, 2); } -TEST_F(MultiProcessTest, HostName) { +TEST(MultiProcessTest, HostName) { const size_t maxNameLen = 1024; std::vector buffer(gEnv->worldSize * maxNameLen, '\0'); std::string hostName = mscclpp::getHostName(maxNameLen, '\0'); @@ -159,7 +162,7 @@ TEST_F(MultiProcessTest, HostName) { } } -TEST_F(MultiProcessTest, HostHash) { +TEST(MultiProcessTest, HostHash) { std::vector buffer(gEnv->worldSize, 0); uint64_t hostHash = mscclpp::getHostHash(); buffer[gEnv->rank] = hostHash; diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index 17046a57..03e4cbde 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -4,8 +4,6 @@ #ifndef MSCCLPP_MP_UNIT_TESTS_HPP_ #define MSCCLPP_MP_UNIT_TESTS_HPP_ -#include - #include #include #include @@ -13,10 +11,18 @@ #include #include +#include "../framework.hpp" #include "ib.hpp" #include "utils_internal.hpp" -class MultiProcessTestEnv : public ::testing::Environment { +// Skip the current test if IBVerbs is not available in this build +#if defined(USE_IBVERBS) +#define REQUIRE_IBVERBS +#else +#define REQUIRE_IBVERBS SKIP_TEST() << "This test requires IBVerbs that the current build does not support." 
+#endif + +class MultiProcessTestEnv : public ::mscclpp::test::Environment { public: MultiProcessTestEnv(int argc, const char** argv); @@ -37,7 +43,7 @@ mscclpp::Transport ibIdToTransport(int id); int rankToLocalRank(int rank); int rankToNode(int rank); -class MultiProcessTest : public ::testing::Test { +class MultiProcessTest : public ::mscclpp::test::TestCase { protected: void TearDown() override; }; diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index 7cc5954a..764c3299 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include #include @@ -178,26 +178,12 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - - kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, params.waitWithPoll, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {1, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelProxyPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, nElem, params.waitWithPoll, nTries, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } proxyService->stopProxy(); } @@ -223,8 +209,7 @@ void 
PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); - const std::string testName = std::string(testInfo->test_suite_name()) + "." + std::string(testInfo->name()); + const std::string testName = ::mscclpp::test::currentTestName(); const int nTries = 1000; // Warm-up @@ -247,63 +232,51 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PingPong) { +TEST(PortChannelOneToOneTest, PingPong) { testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostMode) { + REQUIRE_IBVERBS; testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongEthernet) { +TEST(PortChannelOneToOneTest, PingPongEthernet) { testPingPong(PingPongTestParams{ .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { +TEST(PortChannelOneToOneTest, PingPongWithPoll) { testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { + REQUIRE_IBVERBS; testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode 
= IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerf) { +TEST(PortChannelOneToOneTest, PingPongPerf) { testPingPongPerf(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongPerfIbHostMode) { + REQUIRE_IBVERBS; testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { +TEST(PortChannelOneToOneTest, PingPongPerfEthernet) { testPingPongPerf(PingPongTestParams{ .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } @@ -407,34 +380,14 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) { std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); const int nTries = 1000; - // The least nelem is 2 for packet ping pong - kernelProxyLLPingPong - <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 2, nTries, ret.get()); - 
MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong - <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, - 1024 * 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelProxyLLPingPong<<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, - 4 * 1024 * 1024, nTries, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); + for (int nElem : {2, 1024, 1024 * 1024, 4 * 1024 * 1024}) { + *ret = 0; + kernelProxyLLPingPong + <<<1, 1024>>>(buff.get(), putPacketBuffer.get(), getPacketBuffer.get(), gEnv->rank, nElem, nTries, ret.get()); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + EXPECT_EQ(*ret, 0); + } communicator->bootstrap()->barrier(); @@ -471,8 +424,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) proxyService->startProxy(); - auto* testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); - const std::string testName = std::string(testInfo->test_suite_name()) + "." 
+ std::string(testInfo->name()); + const std::string testName = ::mscclpp::test::currentTestName(); const int nTries = 1000000; // Warm-up @@ -497,47 +449,32 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } +TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) { + REQUIRE_IBVERBS; testPacketPingPong(true, IbMode::Host); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } +TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { + REQUIRE_IBVERBS; testPacketPingPongPerf(true, IbMode::Host); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPacketPingPongPerf(true, IbMode::HostNoAtomic); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { + REQUIRE_IBVERBS; 
testPingPong(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { -#if defined(USE_IBVERBS) +TEST(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { + REQUIRE_IBVERBS; testPacketPingPong(true, IbMode::HostNoAtomic); -#else // !defined(USE_IBVERBS) - GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; -#endif // !defined(USE_IBVERBS) } diff --git a/test/mp_unit/switch_channel_tests.cu b/test/mp_unit/switch_channel_tests.cu index a12919e3..6d913c64 100644 --- a/test/mp_unit/switch_channel_tests.cu +++ b/test/mp_unit/switch_channel_tests.cu @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. 
#include #include @@ -10,10 +10,10 @@ void SwitchChannelTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { - GTEST_SKIP(); + SKIP_TEST(); } if (!mscclpp::isNvlsSupported()) { - GTEST_SKIP(); + SKIP_TEST(); } // Use only two ranks setNumRanksToUse(2); @@ -23,6 +23,8 @@ void SwitchChannelTest::SetUp() { void SwitchChannelTest::TearDown() { CommunicatorTestBase::TearDown(); } __constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan; +__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1; +__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2; __global__ void kernelSwitchReduce() { #if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) @@ -31,7 +33,16 @@ __global__ void kernelSwitchReduce() { #endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) } -TEST_F(SwitchChannelTest, SimpleAllReduce) { +__global__ void kernelSwitchReduceTwo() { +#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) + auto val1 = gConstSwitchChan1.reduce(0); + gConstSwitchChan1.broadcast(0, val1); + auto val2 = gConstSwitchChan2.reduce(0); + gConstSwitchChan2.broadcast(0, val2); +#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) +} + +TEST(SwitchChannelTest, SimpleAllReduce) { if (gEnv->rank >= numRanksToUse) return; std::vector ranks; @@ -66,22 +77,13 @@ TEST_F(SwitchChannelTest, SimpleAllReduce) { for (int i = 0; i < numRanksToUse; i++) { expected += i + 1.0f; } - ASSERT_EQ(result, expected) << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank; + if (result != expected) { + std::cerr << "Expected " << expected << " but got " << result << " for rank " << gEnv->rank << std::endl; + } + ASSERT_EQ(result, expected); } -__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan1; -__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan2; - -__global__ void kernelSwitchReduceTwo() { -#if (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) - auto val1 = 
gConstSwitchChan1.reduce(0); - gConstSwitchChan1.broadcast(0, val1); - auto val2 = gConstSwitchChan2.reduce(0); - gConstSwitchChan2.broadcast(0, val2); -#endif // (CUDA_NVLS_API_AVAILABLE) && (__CUDA_ARCH__ >= 900) -} - -TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { +TEST(SwitchChannelTest, TwoChannelsSameConnection) { if (gEnv->rank >= numRanksToUse) return; std::vector ranks; @@ -97,12 +99,9 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { MSCCLPP_CUDATHROW(cudaMemcpy(buffer1.data(), &data1, sizeof(data1), cudaMemcpyHostToDevice)); MSCCLPP_CUDATHROW(cudaMemcpy(buffer2.data(), &data2, sizeof(data2), cudaMemcpyHostToDevice)); - // Connection size must be large enough for two granularity-aligned buffers. - // The multicast granularity is typically 2MB, so we need at least 2 * 2MB. const size_t connSize = buffer1.bytes() + buffer2.bytes(); auto nvlsConnection = mscclpp::connectNvlsCollective(communicator, ranks, connSize); - // Bind two separate buffers to the same connection auto switchChannel1 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer1.data()), bufSize); auto switchChannel2 = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer2.data()), bufSize); @@ -132,6 +131,6 @@ TEST_F(SwitchChannelTest, TwoChannelsSameConnection) { expected1 += (i + 1.0f) * 1.0f; expected2 += (i + 1.0f) * 10.0f; } - ASSERT_EQ(result1, expected1) << "Channel1: expected " << expected1 << " but got " << result1; - ASSERT_EQ(result2, expected2) << "Channel2: expected " << expected2 << " but got " << result2; + ASSERT_EQ(result1, expected1); + ASSERT_EQ(result2, expected2); } diff --git a/test/perf/CMakeLists.txt b/test/perf/CMakeLists.txt deleted file mode 100644 index 6a16c034..00000000 --- a/test/perf/CMakeLists.txt +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -# Find required packages -find_package(MPI REQUIRED) - -# Note: nlohmann_json::nlohmann_json target is already available from the main project - -# Set up common libraries and includes for tests -set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX) -if(MSCCLPP_USE_IB) - list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES}) -endif() - -set(PERF_TEST_INC_COMMON - PRIVATE ${PROJECT_SOURCE_DIR}/include - SYSTEM PRIVATE ${GPU_INCLUDE_DIRS}) - -# Function to add a test executable -function(add_perf_test_executable name sources) - if(MSCCLPP_USE_ROCM) - set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX) - endif() - add_executable(${name} ${sources}) - target_link_libraries(${name} ${PERF_TEST_LIBS_COMMON}) - - # Link nlohmann_json - use the target from main project - target_link_libraries(${name} nlohmann_json::nlohmann_json) - - if(MSCCLPP_USE_IB) - target_compile_definitions(${name} PRIVATE USE_IBVERBS) - endif() - - target_include_directories(${name} ${PERF_TEST_INC_COMMON}) - target_compile_definitions(${name} PRIVATE MSCCLPP_USE_MPI_FOR_TESTS) - - # Set C++ standard - target_compile_features(${name} PRIVATE cxx_std_17) - - set_target_properties(${name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/perf") -endfunction() - -# Add FIFO test -add_perf_test_executable(fifo_test "framework.cc;fifo_test.cu") diff --git a/test/perf/fifo_test.cu b/test/perf/fifo_test.cu deleted file mode 100644 index bb77a106..00000000 --- a/test/perf/fifo_test.cu +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "framework.hpp" - -using namespace mscclpp::test; - -// Constants for timeout and trigger calculation -constexpr uint64_t TIMEOUT_SPINS = 1000000; -constexpr int MIN_TRIGGERS = 1000; -constexpr int MIN_WARMUP_TRIGGERS = 100; -constexpr int TRIGGERS_PER_FIFO_SIZE = 10; -constexpr int WARMUP_TRIGGERS_PER_FIFO_SIZE = 2; - -__constant__ mscclpp::FifoDeviceHandle gFifoDeviceHandle; - -__global__ void kernelFifoPush(size_t numTriggers) { - mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - mscclpp::ProxyTrigger trigger; - for (size_t i = 1; i <= numTriggers; ++i) { - trigger.fst = i; - trigger.snd = tid ^ i; - fifo.push(trigger); - } -} - -__global__ void kernelFifoPushSync(size_t numTriggers) { - mscclpp::FifoDeviceHandle& fifo = gFifoDeviceHandle; - mscclpp::ProxyTrigger trigger; - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i = 1; i <= numTriggers; ++i) { - trigger.fst = i; - trigger.snd = tid ^ i; - fifo.sync(fifo.push(trigger)); - } -} - -static void setupCuda(int& cudaDevice, int& numaNode) { - utils::CUDA_CHECK(cudaGetDevice(&cudaDevice)); - numaNode = mscclpp::getDeviceNumaNode(cudaDevice); - mscclpp::numaBind(numaNode); -} - -// Helper function to consume triggers from FIFO -static bool consumeTriggers(std::unique_ptr& hostFifo, int numTriggers, int parallel) { - int totalTriggers = numTriggers * parallel; - std::unordered_map triggerCounts; - for (int i = 0; i < totalTriggers; ++i) { - mscclpp::ProxyTrigger trigger; - uint64_t spin = 0; - do { - trigger = hostFifo->poll(); - if (spin++ > TIMEOUT_SPINS) { - return false; - } - } while (trigger.fst == 0 || trigger.snd == 0); - - // Process trigger (see src/proxy.cc) - trigger.snd ^= ((uint64_t)1 << (uint64_t)63); - trigger.snd = trigger.snd ^ trigger.fst; - assert(triggerCounts[trigger.snd] + 1 == trigger.fst); - 
triggerCounts[trigger.snd]++; - hostFifo->pop(); - } - return true; -} - -// Helper function to run a single kernel variant and return performance metrics -std::tuple runSingleKernelVariant(void (*kernel)(size_t), - std::unique_ptr& hostFifo, - cudaStream_t stream, int numParallel) { - // Calculate triggers based on FIFO size - const int numTriggers = std::max(MIN_TRIGGERS, static_cast(hostFifo->size() * TRIGGERS_PER_FIFO_SIZE)); - const int warmupTriggers = - std::max(MIN_WARMUP_TRIGGERS, static_cast(hostFifo->size() * WARMUP_TRIGGERS_PER_FIFO_SIZE)); - - // Warmup - kernel<<>>(warmupTriggers); - utils::CUDA_CHECK(cudaGetLastError()); - - // Process warmup triggers (note: total triggers = warmupTriggers * numParallel) - if (!consumeTriggers(hostFifo, warmupTriggers, numParallel)) { - return {0.0, 0.0, 0, 0}; // Return error values - } - utils::CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Benchmark - utils::Timer timer; - timer.start(); - - kernel<<>>(numTriggers); - utils::CUDA_CHECK(cudaGetLastError()); - - // Process all triggers - if (!consumeTriggers(hostFifo, numTriggers, numParallel)) { - return {0.0, 0.0, 0, 0}; - } - utils::CUDA_CHECK(cudaStreamSynchronize(stream)); - - timer.stop(); - - const int totalTriggers = numTriggers * numParallel; - double throughput = totalTriggers / timer.elapsedSeconds(); - double duration_us = timer.elapsedMicroseconds(); - - utils::CUDA_CHECK(cudaDeviceSynchronize()); - - return {throughput, duration_us, totalTriggers, warmupTriggers * numParallel}; -} - -void runFifoTestVariant(std::unique_ptr& hostFifo, cudaStream_t stream, int numParallel, - nlohmann::ordered_json& combinedMetrics) { - auto [pushThroughput, pushDuration, numTriggers, warmupTriggers] = - runSingleKernelVariant(kernelFifoPush, hostFifo, stream, numParallel); - - auto [syncThroughput, syncDuration, syncNumTriggers, syncWarmupTriggers] = - runSingleKernelVariant(kernelFifoPushSync, hostFifo, stream, numParallel); - - auto formatThroughput = [](double 
thru) { - return double(int(thru * 10)) / 10.0; // Round to 1 decimal place - }; - - std::string prefix = "p" + std::to_string(numParallel) + "_"; - combinedMetrics[prefix + "push_throughput"] = formatThroughput(pushThroughput); - combinedMetrics[prefix + "push_sync_throughput"] = formatThroughput(syncThroughput); - combinedMetrics[prefix + "push_duration_us"] = pushDuration; - combinedMetrics[prefix + "push_sync_duration_us"] = syncDuration; - combinedMetrics[prefix + "num_triggers"] = numTriggers; - combinedMetrics[prefix + "warmup_triggers"] = warmupTriggers; -} - -struct FifoTestConfig { - int fifoSize; - std::vector parallelismLevels; - - // Constructor with default parallelism levels - FifoTestConfig(int size, const std::vector& parallel = {1, 2, 4, 8, 16}) - : fifoSize(size), parallelismLevels(parallel) {} -}; - -void runFifoTest(const FifoTestConfig& config, [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, - [[maybe_unused]] int localRank) { - if (config.fifoSize <= 0) { - throw std::invalid_argument("FIFO size must be positive"); - } - if (config.parallelismLevels.empty()) { - throw std::invalid_argument("At least one parallelism level must be specified"); - } - - int cudaDevice, numaNode; - setupCuda(cudaDevice, numaNode); - - auto hostFifo = std::make_unique(config.fifoSize); - - mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle(); - utils::CUDA_CHECK(cudaMemcpyToSymbol(gFifoDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle))); - - cudaStream_t stream; - utils::CUDA_CHECK(cudaStreamCreate(&stream)); - - // Create test name with parallelism range - std::string testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_Parallel"; - - // Add parallelism range to test name (e.g., "P1-16" or "P1-4-16-64") - if (!config.parallelismLevels.empty()) { - testName += std::to_string(config.parallelismLevels.front()); - if (config.parallelismLevels.size() > 1) { - testName += "-" + std::to_string(config.parallelismLevels.back()); 
- - // If parallelism levels have non-standard steps, include more detail - if (config.parallelismLevels.size() > 2 && - (config.parallelismLevels[1] != 2 * config.parallelismLevels[0] || config.parallelismLevels.size() > 3)) { - testName = "FifoTest_Size" + std::to_string(config.fifoSize) + "_ParallelCustom"; - } - } - } - - // Print test configuration - if (utils::isMainRank()) { - std::stringstream ss; - ss << "Running FIFO test with size=" << config.fifoSize << ", parallelism_levels=["; - for (size_t i = 0; i < config.parallelismLevels.size(); ++i) { - if (i > 0) ss << ","; - ss << config.parallelismLevels[i]; - } - ss << "]"; - std::cout << ss.str() << std::endl; - } - - nlohmann::ordered_json combinedMetrics; - - for (int numParallel : config.parallelismLevels) { - runFifoTestVariant(hostFifo, stream, numParallel, combinedMetrics); - } - - std::map testParams; - testParams["fifo_size"] = std::to_string(static_cast(hostFifo->size())); - - // Add parallelism levels to test parameters - std::stringstream parallelismStream; - for (size_t i = 0; i < config.parallelismLevels.size(); ++i) { - if (i > 0) parallelismStream << ","; - parallelismStream << config.parallelismLevels[i]; - } - testParams["parallelism_levels"] = parallelismStream.str(); - - utils::recordResult(testName, "fifo", combinedMetrics, testParams); - - utils::CUDA_CHECK(cudaStreamDestroy(stream)); -} - -void runAllFifoTests([[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] int localRank) { - // clang-format off - std::vector configs = { - {1, {1}}, - {128, {1, 8, 64, 128}}, - {512, {1, 8, 64, 256, 512}}, - }; - // clang-format on - - for (const auto& config : configs) { - runFifoTest(config, rank, worldSize, localRank); - } -} - -static void printUsage(char* argv0) { - std::stringstream ss; - ss << "Usage: " << argv0 << " [OPTIONS]\n" - << "\n" - << "Options:\n" - << " -o, --output-format FORMAT Output format: human or json (default: human)\n" - << " -f, --output-file FILE 
JSON output file path (default: report.jsonl)\n" - << " -v, --verbose Increase verbosity\n" - << " -h, --help Show this help message\n"; - std::cout << ss.str(); -} - -int main(int argc, char* argv[]) { - std::string outputFormat = "human"; - std::string outputFile = "report.jsonl"; - bool verbose = false; - - static struct option longOptions[] = {{"output-format", required_argument, 0, 'o'}, - {"output-file", required_argument, 0, 'f'}, - {"verbose", no_argument, 0, 'v'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0}}; - - int c; - while ((c = getopt_long(argc, argv, "o:f:vh", longOptions, nullptr)) != -1) { - switch (c) { - case 'o': - outputFormat = optarg; - break; - case 'f': - outputFile = optarg; - break; - case 'v': - verbose = true; - break; - case 'h': - printUsage(argv[0]); - return 0; - default: - printUsage(argv[0]); - return 1; - } - } - - std::vector>> tests = { - {"AllFifoTests", "FIFO performance tests with multiple configurations", runAllFifoTests}}; - - int result = utils::runMultipleTests(argc, argv, tests); - - if (utils::isMainRank()) { - if (outputFormat == "json") { - utils::writeResultsToFile(outputFile); - } else { - utils::printResults(verbose); - } - } - - utils::cleanupMPI(); - - return result; -} diff --git a/test/perf/framework.cc b/test/perf/framework.cc deleted file mode 100644 index 85f7abd8..00000000 --- a/test/perf/framework.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "framework.hpp" - -#include -#include -#include -#include - -namespace mscclpp { -namespace test { - -// Global state for results -static std::vector g_results; -static int g_mpi_rank = 0; -static int g_mpi_size = 1; -static bool g_mpi_initialized = false; - -namespace utils { - -// Internal MPI helper functions (not exposed in header) -void initializeMPI(int argc, char* argv[]) { - if (g_mpi_initialized) return; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank); - MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size); - g_mpi_initialized = true; -} - -static void finalizeMPI() { - if (!g_mpi_initialized) return; - - MPI_Finalize(); - g_mpi_initialized = false; -} - -static int getMPIRank() { return g_mpi_rank; } - -static int getMPISize() { return g_mpi_size; } - -static bool isMainProcess() { return g_mpi_rank == 0; } - -// Public utility functions for test output -bool isMainRank() { return g_mpi_rank == 0; } - -void cleanupMPI() { finalizeMPI(); } - -std::string getCurrentTimestamp() { - auto now = std::chrono::system_clock::now(); - auto time_t = std::chrono::system_clock::to_time_t(now); - std::stringstream ss; - ss << std::put_time(std::gmtime(&time_t), "%Y-%m-%dT%H:%M:%S"); - return ss.str(); -} - -void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics, - const std::map& test_params) { - TestResult result; - result.test_name = test_name; - result.test_category = test_category; - result.test_params = test_params; - result.metrics = metrics; - result.num_processes = g_mpi_size; - result.process_rank = g_mpi_rank; - result.timestamp = getCurrentTimestamp(); - - g_results.push_back(result); -} - -void writeResultsToFile(const std::string& filename) { - std::ofstream file(filename); - if (!file) { - throw std::runtime_error("Cannot open output file: " + filename); - } - - for (const auto& result : g_results) { - nlohmann::ordered_json j; - j["test_name"] = result.test_name; 
- j["test_category"] = result.test_category; - j["test_config"] = result.test_params; - j["metrics"] = result.metrics; - j["num_processes"] = result.num_processes; - j["process_rank"] = result.process_rank; - j["timestamp"] = result.timestamp; - - file << j.dump() << std::endl; - } -} - -void printResults(bool verbose) { - if (!isMainProcess()) return; - - std::cout << "\n=== Test Results ===" << std::endl; - - for (const auto& result : g_results) { - std::cout << "\nTest: " << result.test_name << " (" << result.test_category << ")" << std::endl; - - if (verbose && !result.test_params.empty()) { - std::cout << " Parameters:" << std::endl; - for (const auto& param : result.test_params) { - std::cout << " " << param.first << ": " << param.second << std::endl; - } - } - - std::cout << " Metrics:" << std::endl; - for (auto it = result.metrics.begin(); it != result.metrics.end(); ++it) { - std::cout << " " << it.key() << ": " << it.value() << std::endl; - } - } - std::cout << std::endl; -} - -// Timer implementation -Timer::Timer() : is_running_(false) {} - -void Timer::start() { - start_time_ = std::chrono::high_resolution_clock::now(); - is_running_ = true; -} - -void Timer::stop() { - end_time_ = std::chrono::high_resolution_clock::now(); - is_running_ = false; -} - -double Timer::elapsedMicroseconds() const { - if (is_running_) { - auto now = std::chrono::high_resolution_clock::now(); - return std::chrono::duration_cast(now - start_time_).count(); - } - return std::chrono::duration_cast(end_time_ - start_time_).count(); -} - -double Timer::elapsedMilliseconds() const { return elapsedMicroseconds() / 1000.0; } - -double Timer::elapsedSeconds() const { return elapsedMicroseconds() / 1000000.0; } - -void cudaCheck(cudaError_t err, const char* file, int line) { - if (err != cudaSuccess) { - std::string msg = - std::string("CUDA error at ") + file + ":" + std::to_string(line) + " - " + cudaGetErrorString(err); - throw std::runtime_error(msg); - } -} - -int 
runMultipleTests( - int argc, char* argv[], - const std::vector>>& tests) { - int totalResult = 0; - - // Initialize MPI once for all tests - initializeMPI(argc, argv); - - try { - // Get MPI information - int rank = getMPIRank(); - int size = getMPISize(); - int local_rank = rank; // For simplicity, assume local_rank = rank - - for (const auto& test : tests) { - const std::string& testName = std::get<0>(test); - const std::string& testDescription = std::get<1>(test); - const std::function& testFunction = std::get<2>(test); - - if (rank == 0) { - std::cout << "Running test: " << testName << std::endl; - if (!testDescription.empty()) { - std::cout << " " << testDescription << std::endl; - } - } - - // Don't clear results - accumulate them for all tests in the same file - // g_results.clear(); // Commented out to accumulate results - - try { - // Run the individual test function with MPI information - testFunction(rank, size, local_rank); - - // Synchronize before moving to next test - MPI_Barrier(MPI_COMM_WORLD); - - } catch (const std::exception& e) { - if (rank == 0) { - std::cerr << "Error in test " << testName << ": " << e.what() << std::endl; - } - totalResult = 1; - } - } - - // Don't cleanup MPI here - let the caller handle it - // finalizeMPI(); - - } catch (const std::exception& e) { - if (g_mpi_rank == 0) { - std::cerr << "Error: " << e.what() << std::endl; - } - finalizeMPI(); - return 1; - } - - return totalResult; -} - -} // namespace utils -} // namespace test -} // namespace mscclpp diff --git a/test/perf/framework.hpp b/test/perf/framework.hpp deleted file mode 100644 index e9b8c31f..00000000 --- a/test/perf/framework.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#ifndef MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ -#define MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace mscclpp { -namespace test { - -// Test result structure -struct TestResult { - std::string test_name; - std::string test_category; - std::map test_params; - nlohmann::ordered_json metrics; - int num_processes; - int process_rank; - std::string timestamp; -}; - -// Simple utility functions for testing -namespace utils { - -// Test execution utilities -int runMultipleTests( - int argc, char* argv[], - const std::vector>>& tests); - -// MPI management -void initializeMPI(int argc, char* argv[]); -void cleanupMPI(); -bool isMainRank(); - -// Result recording -void recordResult(const std::string& test_name, const std::string& test_category, const nlohmann::ordered_json& metrics, - const std::map& test_params = {}); - -// Output utilities -void writeResultsToFile(const std::string& filename); -void printResults(bool verbose = false); -void cleanupMPI(); - -// Timing utilities -class Timer { - public: - Timer(); - void start(); - void stop(); - double elapsedMicroseconds() const; - double elapsedMilliseconds() const; - double elapsedSeconds() const; - - private: - std::chrono::high_resolution_clock::time_point start_time_; - std::chrono::high_resolution_clock::time_point end_time_; - bool is_running_; -}; - -// CUDA utilities -void cudaCheck(cudaError_t err, const char* file, int line); -#define CUDA_CHECK(call) cudaCheck(call, __FILE__, __LINE__) - -} // namespace utils - -} // namespace test -} // namespace mscclpp - -#endif // MSCCLPP_TEST_PERF_FRAMEWORK_HPP_ diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 312d31ef..7836e063 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,11 +1,13 @@ # Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. +# Licensed under the MIT License. 
target_sources(unit_tests PRIVATE + unit_tests_main.cc core_tests.cc gpu_utils_tests.cc errors_tests.cc fifo_tests.cu + fifo_perf_tests.cu numa_tests.cc socket_tests.cc utils_tests.cc diff --git a/test/unit/compile_tests.cu b/test/unit/compile_tests.cu index 9db91a4f..893bb940 100644 --- a/test/unit/compile_tests.cu +++ b/test/unit/compile_tests.cu @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. -#include +#include "../framework.hpp" #undef NDEBUG #ifndef DEBUG_BUILD diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc index 32e6a1b5..d2552ff3 100644 --- a/test/unit/core_tests.cc +++ b/test/unit/core_tests.cc @@ -1,12 +1,14 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include +// Licensed under the MIT License. #include -class LocalCommunicatorTest : public ::testing::Test { +#include "../framework.hpp" + +// TODO: TransportFlags needs operator<< for EXPECT_EQ to work +// Using ASSERT_TRUE with manual comparisons as workaround + +class LocalCommunicatorTest : public ::mscclpp::test::TestCase { protected: void SetUp() override { bootstrap = std::make_shared(0, 1); @@ -18,15 +20,15 @@ class LocalCommunicatorTest : public ::testing::Test { std::shared_ptr comm; }; -TEST_F(LocalCommunicatorTest, RegisterMemory) { +TEST(LocalCommunicatorTest, RegisterMemory) { int dummy[42]; auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); EXPECT_EQ(memory.data(), &dummy); EXPECT_EQ(memory.size(), sizeof(dummy)); - EXPECT_EQ(memory.transports(), mscclpp::NoTransports); + ASSERT_TRUE(memory.transports() == mscclpp::NoTransports); } -TEST_F(LocalCommunicatorTest, SendMemoryToSelf) { +TEST(LocalCommunicatorTest, SendMemoryToSelf) { int dummy[42]; auto memory = comm->registerMemory(&dummy, sizeof(dummy), mscclpp::NoTransports); comm->sendMemory(memory, 0); @@ -34,5 +36,5 @@ TEST_F(LocalCommunicatorTest, SendMemoryToSelf) { 
auto sameMemory = memoryFuture.get(); EXPECT_EQ(sameMemory.data(), memory.data()); EXPECT_EQ(sameMemory.size(), memory.size()); - EXPECT_EQ(sameMemory.transports(), memory.transports()); + ASSERT_TRUE(sameMemory.transports() == memory.transports()); } diff --git a/test/unit/errors_tests.cc b/test/unit/errors_tests.cc index f9faad19..3eeed387 100644 --- a/test/unit/errors_tests.cc +++ b/test/unit/errors_tests.cc @@ -1,30 +1,33 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include +#include "../framework.hpp" + +// TODO: ErrorCode needs operator<< for EXPECT_EQ to work +// Using ASSERT_TRUE with manual comparisons as workaround + TEST(ErrorsTest, SystemError) { mscclpp::Error error("test", mscclpp::ErrorCode::SystemError); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::SystemError); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::SystemError); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: SystemError)")); } TEST(ErrorsTest, InternalError) { mscclpp::Error error("test", mscclpp::ErrorCode::InternalError); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InternalError); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InternalError); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InternalError)")); } TEST(ErrorsTest, InvalidUsage) { mscclpp::Error error("test", mscclpp::ErrorCode::InvalidUsage); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::InvalidUsage); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::InvalidUsage); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: InvalidUsage)")); } TEST(ErrorsTest, Timeout) { mscclpp::Error error("test", mscclpp::ErrorCode::Timeout); - EXPECT_EQ(error.getErrorCode(), mscclpp::ErrorCode::Timeout); + ASSERT_TRUE(error.getErrorCode() == mscclpp::ErrorCode::Timeout); EXPECT_EQ(error.what(), std::string("test (mscclpp failure: Timeout)")); } diff --git 
a/test/unit/fifo_perf_tests.cu b/test/unit/fifo_perf_tests.cu new file mode 100644 index 00000000..34b5d6bc --- /dev/null +++ b/test/unit/fifo_perf_tests.cu @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include + +#include "../framework.hpp" + +// Simple FIFO performance test to be run as part of unit_tests +// This is a performance test that can be excluded from coverage runs +// using the --exclude-perf-tests flag. + +constexpr uint64_t TIMEOUT_SPINS = 1000000; +constexpr int MIN_TRIGGERS = 100; // Reduced for faster unit test execution + +__constant__ mscclpp::FifoDeviceHandle gFifoPerfDeviceHandle; + +__global__ void kernelFifoPerfPush(size_t numTriggers) { + mscclpp::FifoDeviceHandle& fifo = gFifoPerfDeviceHandle; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + mscclpp::ProxyTrigger trigger; + for (size_t i = 1; i <= numTriggers; ++i) { + trigger.fst = i; + trigger.snd = tid ^ i; + fifo.push(trigger); + } +} + +static bool consumePerfTriggers(std::unique_ptr& hostFifo, int numTriggers, int parallel) { + int totalTriggers = numTriggers * parallel; + std::unordered_map triggerCounts; + for (int i = 0; i < totalTriggers; ++i) { + mscclpp::ProxyTrigger trigger; + uint64_t spin = 0; + do { + trigger = hostFifo->poll(); + if (spin++ > TIMEOUT_SPINS) { + return false; + } + } while (trigger.fst == 0 || trigger.snd == 0); + + trigger.snd ^= ((uint64_t)1 << (uint64_t)63); + trigger.snd = trigger.snd ^ trigger.fst; + if (triggerCounts[trigger.snd] + 1 != trigger.fst) { + return false; // Validation failed + } + triggerCounts[trigger.snd]++; + hostFifo->pop(); + } + return true; +} + +PERF_TEST(FifoPerfTest, BasicPerformance) { + int cudaDevice, numaNode; + CUDA_CHECK(cudaGetDevice(&cudaDevice)); + numaNode = mscclpp::getDeviceNumaNode(cudaDevice); + mscclpp::numaBind(numaNode); + + const int fifoSize = 128; + const int numTriggers = MIN_TRIGGERS; + const int 
numParallel = 1; + + auto hostFifo = std::make_unique(fifoSize); + mscclpp::FifoDeviceHandle hostHandle = hostFifo->deviceHandle(); + CUDA_CHECK(cudaMemcpyToSymbol(gFifoPerfDeviceHandle, &hostHandle, sizeof(mscclpp::FifoDeviceHandle))); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Run kernel + kernelFifoPerfPush<<>>(numTriggers); + CUDA_CHECK(cudaGetLastError()); + + // Process triggers + bool success = consumePerfTriggers(hostFifo, numTriggers, numParallel); + ASSERT_TRUE(success); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaDeviceSynchronize()); +} diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu index b67a220d..8d30ca5e 100644 --- a/test/unit/fifo_tests.cu +++ b/test/unit/fifo_tests.cu @@ -1,13 +1,12 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include #include #include +#include "../framework.hpp" #include "utils_internal.hpp" #define ITER 10000 // should be larger than the FIFO size for proper testing diff --git a/test/unit/gpu_utils_tests.cc b/test/unit/gpu_utils_tests.cc index f4aba0d7..977314e9 100644 --- a/test/unit/gpu_utils_tests.cc +++ b/test/unit/gpu_utils_tests.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include +#include "../framework.hpp" + TEST(GpuUtilsTest, StreamPool) { auto streamPool = mscclpp::gpuStreamPool(); cudaStream_t s; diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu index 50ffc9ea..699baa38 100644 --- a/test/unit/local_channel_tests.cu +++ b/test/unit/local_channel_tests.cu @@ -1,13 +1,13 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. 
#include #include #include #include +#include "../framework.hpp" + #define MAGIC_CONST 777 __constant__ mscclpp::PortChannelDeviceHandle gPortChannel; diff --git a/test/unit/numa_tests.cc b/test/unit/numa_tests.cc index dfa63a74..46bf5e18 100644 --- a/test/unit/numa_tests.cc +++ b/test/unit/numa_tests.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include +#include "../framework.hpp" + TEST(NumaTest, Basic) { int num; MSCCLPP_CUDATHROW(cudaGetDeviceCount(&num)); diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc index 1ab592ba..a5598938 100644 --- a/test/unit/socket_tests.cc +++ b/test/unit/socket_tests.cc @@ -1,11 +1,10 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include +// Licensed under the MIT License. #include #include +#include "../framework.hpp" #include "socket.h" #include "utils_internal.hpp" diff --git a/test/unit/unit_tests_main.cc b/test/unit/unit_tests_main.cc new file mode 100644 index 00000000..397566e0 --- /dev/null +++ b/test/unit/unit_tests_main.cc @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "../framework.hpp" + +int main(int argc, char** argv) { return RUN_ALL_TESTS(); } diff --git a/test/unit/utils_internal_tests.cc b/test/unit/utils_internal_tests.cc index 5479a681..8526d9fe 100644 --- a/test/unit/utils_internal_tests.cc +++ b/test/unit/utils_internal_tests.cc @@ -1,10 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#include - #include +#include "../framework.hpp" #include "utils_internal.hpp" TEST(UtilsInternalTest, getHostHash) { diff --git a/test/unit/utils_tests.cc b/test/unit/utils_tests.cc index fa079b30..51562c21 100644 --- a/test/unit/utils_tests.cc +++ b/test/unit/utils_tests.cc @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include +// Licensed under the MIT License. #include #include #include +#include "../framework.hpp" + TEST(UtilsTest, getHostName) { std::string hostname1 = mscclpp::getHostName(1024, '.'); EXPECT_FALSE(hostname1.empty()); From fd76507e9a6b57a6de7ad832deb4a15dc4d60195 Mon Sep 17 00:00:00 2001 From: Ekow Wellington <34079588+ekwhoa@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:27:33 -0500 Subject: [PATCH 03/21] Install default plans under MSCCLPP_CACHE_DIR/default (#769) ### Summary Update the installer to place bundled default execution plans under `/default`, which is where the runtime already looks for bundled plans. ### Background The C++ runtime treats `MSCCLPP_CACHE_DIR` as the cache *root* and loads bundled default plans from `/default`. When `MSCCLPP_CACHE_DIR` was set, the installer instead wrote bundled plans directly into the cache root, causing the runtime to miss them. This surfaced while running benchmarking tests with a non-default `MSCCLPP_CACHE_DIR`, where the bundled plans were not being discovered. ### Change This PR updates the installer to always install bundled default plans into `/default`, preserving the existing runtime contract. ### Scope - Installer-only change - No runtime behavior changes ### Validation Manual inspection of the updated install path. Successful build --------- Co-authored-by: Ekow Wellington --- docs/dsl/quick_start.md | 4 ++++ docs/dsl/results.md | 3 +++ python/mscclpp/__main__.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/dsl/quick_start.md b/docs/dsl/quick_start.md index 6c32ec32..afccd48e 100644 --- a/docs/dsl/quick_start.md +++ b/docs/dsl/quick_start.md @@ -12,6 +12,10 @@ After finishing the installation in the quick start section, you can add the fol python3 -m mscclpp --install ``` +This installs bundled default execution plans into `~/.cache/mscclpp/default` by default. +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed into `MSCCLPP_CACHE_DIR/default`. 
+`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path. + ## Your First Algorithm: AllGather Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features. diff --git a/docs/dsl/results.md b/docs/dsl/results.md index 99f19476..a1adad2a 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -59,6 +59,9 @@ After installation, the generated JSON execution plan can be found at: ~/.cache/mscclpp/default/ ``` +If `MSCCLPP_CACHE_DIR` is set, bundled default plans are installed under `MSCCLPP_CACHE_DIR/default/`. +`MSCCLPP_CACHE_DIR` specifies the cache root directory, so it should be set without `default` in the path. + **Performance Results:** The figure below shows the performance characteristics for small message sizes in a two-node configuration: diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index d57cb362..6a6f5f28 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default") + plan_dir = os.path.join(os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp"), "default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) From 4f3638b60db4640eb5f0cd4c1c92e05a72227474 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 31 Mar 2026 15:34:43 -0700 Subject: [PATCH 04/21] Use PTX red for D2D semaphore signal (#768) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Replace the two-step `signal()` implementation (`incOutbound()` + `atomicStore()`) with a single fire-and-forget PTX `red.release.sys.global.add.u64` instruction - This eliminates one local atomic fetch-add and replaces a remote store with a remote atomic add that has no 
return value — more efficient on both NVIDIA (PTX `red`) and AMD (compiler optimizes `(void)fetch_add` to fire-and-forget `flat_atomic_add_x2`) - Add a C++ perf test (`PERF_TEST`) in `mp_unit` for signal+wait ping-pong latency ### Performance (H100, 2 ranks, signal+wait round-trip) ``` SemaphorePerfTest.SignalPingPong: Store-based (old): 2.595 us/iter Red-based (new): 2.345 us/iter Speedup: 1.11x ``` ## Test plan - [x] Builds successfully (`make mp_unit_tests`) - [x] `mpirun -np 2 ./build/bin/mp_unit_tests --filter "SemaphorePerfTest"` — 1.11x speedup 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 --- include/mscclpp/semaphore.hpp | 1 - include/mscclpp/semaphore_device.hpp | 34 ++++--------- python/csrc/semaphore_py.cpp | 1 - src/core/semaphore.cc | 5 +- test/mp_unit/CMakeLists.txt | 1 + test/mp_unit/mp_unit_tests.hpp | 6 +++ test/mp_unit/semaphore_perf_tests.cu | 73 ++++++++++++++++++++++++++++ 7 files changed, 91 insertions(+), 30 deletions(-) create mode 100644 test/mp_unit/semaphore_perf_tests.cu diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index 27f9aefa..85787c95 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -82,7 +82,6 @@ class MemoryDevice2DeviceSemaphore { private: Semaphore semaphore_; detail::UniqueGpuPtr expectedInboundToken_; - detail::UniqueGpuPtr outboundToken_; public: /// Constructor. diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp index f1b01e89..a790a6e1 100644 --- a/include/mscclpp/semaphore_device.hpp +++ b/include/mscclpp/semaphore_device.hpp @@ -82,19 +82,20 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle { /// Signal remote device, ensures prior memory ops complete. MSCCLPP_DEVICE_INLINE void signal() { - auto outbound = incOutbound(); -#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ == 800) - // Using memoryOrderSeqCst is faster for A100. 
- atomicStore(remoteInboundToken, outbound, memoryOrderSeqCst); -#else - atomicStore(remoteInboundToken, outbound, memoryOrderRelease); +#if defined(MSCCLPP_DEVICE_CUDA) + asm volatile("red.release.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory"); +#elif defined(MSCCLPP_DEVICE_HIP) + (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelease); #endif } /// Relaxed signal; no memory completion guarantee. Use it only for synchronizing execution, not data. MSCCLPP_DEVICE_INLINE void relaxedSignal() { - auto outbound = incOutbound(); - atomicStore(remoteInboundToken, outbound, memoryOrderRelaxed); +#if defined(MSCCLPP_DEVICE_CUDA) + asm volatile("red.relaxed.sys.global.add.u64 [%0], %1;" ::"l"(remoteInboundToken), "l"((uint64_t)1) : "memory"); +#elif defined(MSCCLPP_DEVICE_HIP) + (void)atomicFetchAdd(remoteInboundToken, (uint64_t)1, memoryOrderRelaxed); +#endif } /// Thread-safe read of expected inbound value. @@ -121,27 +122,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle { return atomicLoad(inboundToken, memoryOrderRelaxed); } - /// Thread-safe read of outbound value. - /// @return The outbound value. - MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() { - return atomicLoad(outboundToken, memoryOrderRelaxed); - } - - /// Thread-safe increment of outbound value. - /// @return The incremented outbound value. - MSCCLPP_DEVICE_INLINE uint64_t incOutbound() { - return atomicFetchAdd(outboundToken, 1, memoryOrderRelaxed) + 1; - } #endif // defined(MSCCLPP_DEVICE_COMPILE) /// A local memory space where the remote device will write its semaphore value and the local device will read it. uint64_t* inboundToken; - /// A local memory space where the local device stores the semaphore value to be written to the remote device. - uint64_t* outboundToken; - - /// A remote memory space where the local device writes its outboundToken on. This is inboundToken of the - /// remote device. 
+ /// A remote memory space where the local device atomically increments. This is inboundToken of the remote device. uint64_t* remoteInboundToken; /// A local memory space where the local device stores the expected value of the inboundToken to wait for. diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 36d559f2..17c06a7d 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -43,7 +43,6 @@ void register_semaphore(nb::module_& m) { nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) .def_rw("inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundToken) - .def_rw("outbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundToken) .def_rw("remote_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundToken) .def_rw("expected_inbound_token", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundToken) .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index c6eb1e23..bea43327 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -183,9 +183,7 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { } MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(const Semaphore& semaphore) - : semaphore_(semaphore), - expectedInboundToken_(detail::gpuCallocUnique()), - outboundToken_(detail::gpuCallocUnique()) { + : semaphore_(semaphore), expectedInboundToken_(detail::gpuCallocUnique()) { if (connection().localDevice().type != DeviceType::GPU) { throw Error("Local endpoint device type of MemoryDevice2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); } @@ -202,7 +200,6 @@ MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSe device.remoteInboundToken = reinterpret_cast(semaphore_.remoteMemory().data()); device.inboundToken = 
reinterpret_cast(semaphore_.localMemory().data()); device.expectedInboundToken = expectedInboundToken_.get(); - device.outboundToken = outboundToken_.get(); return device; }; diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index b99bb09d..d4004e8e 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(mp_unit_tests PRIVATE communicator_tests.cu port_channel_tests.cu memory_channel_tests.cu + semaphore_perf_tests.cu switch_channel_tests.cu executor_tests.cc ) diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index 03e4cbde..5f95d660 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -176,6 +176,12 @@ class MemoryChannelOneToOneTest : public CommunicatorTestBase { std::unordered_map> memorySemaphores; }; +class SemaphorePerfTest : public CommunicatorTestBase { + protected: + void SetUp() override; + void TearDown() override; +}; + class SwitchChannelTest : public CommunicatorTestBase { protected: void SetUp() override; diff --git a/test/mp_unit/semaphore_perf_tests.cu b/test/mp_unit/semaphore_perf_tests.cu new file mode 100644 index 00000000..92560539 --- /dev/null +++ b/test/mp_unit/semaphore_perf_tests.cu @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include +#include + +#include "mp_unit_tests.hpp" + +void SemaphorePerfTest::SetUp() { + // Need at least two ranks within a node + if (gEnv->nRanksPerNode < 2) { + SKIP_TEST(); + } + setNumRanksToUse(2); + CommunicatorTestBase::SetUp(); +} + +void SemaphorePerfTest::TearDown() { CommunicatorTestBase::TearDown(); } + +__constant__ mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle gSemaphorePerfTestHandle; + +__global__ void kernelSemaphorePingPong(int rank, int nIters) { + mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle& sem = gSemaphorePerfTestHandle; + + // Warmup + for (int i = 0; i < 10; i++) { + if ((rank ^ (i & 1)) == 0) { + sem.signal(); + } else { + sem.wait(); + } + } + + // Timed iterations — alternating signal/wait like the memory channel ping-pong + for (int i = 0; i < nIters; i++) { + if ((rank ^ (i & 1)) == 0) { + sem.signal(); + } else { + sem.wait(); + } + } +} + +PERF_TEST(SemaphorePerfTest, SignalPingPong) { + if (gEnv->rank >= numRanksToUse) return; + + connectMesh(/*useIpc=*/true, /*useIb=*/false, /*useEthernet=*/false); + + int peerRank = (gEnv->rank == 0) ? 
1 : 0; + auto d2dSemaphore = std::make_shared(*communicator, connections[peerRank]); + + auto devHandle = d2dSemaphore->deviceHandle(); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gSemaphorePerfTestHandle, &devHandle, sizeof(devHandle))); + + const int nIters = 1000; + const std::string testName = ::mscclpp::test::currentTestName(); + + // Warmup run + kernelSemaphorePingPong<<<1, 1>>>(gEnv->rank, nIters); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + + communicator->bootstrap()->barrier(); + + // Timed run + mscclpp::Timer timer; + kernelSemaphorePingPong<<<1, 1>>>(gEnv->rank, nIters); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + if (gEnv->rank == 0) { + std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)nIters << " us/iter\n"; + } +} From d2f7056cf4d1956cb452ee475b331f8e19e1d886 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 31 Mar 2026 22:30:35 -0700 Subject: [PATCH 05/21] Add unit testing framework readme (#766) --- test/README.md | 130 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 test/README.md diff --git a/test/README.md b/test/README.md new file mode 100644 index 00000000..a69b66ad --- /dev/null +++ b/test/README.md @@ -0,0 +1,130 @@ +# MSCCL++ C++ Test Framework + +A lightweight, GTest-like test framework with MPI support for testing MSCCL++ C++ APIs. Defined in `framework.hpp` / `framework.cc`. + +## Adding a New Test (Step-by-Step) + +### Single-process test (unit/) + +1. **Create the test file** `test/unit/my_feature_tests.cc` (or `.cu` for CUDA): + + ```cpp + #include "../framework.hpp" + #include + + TEST(MyFeatureTest, BasicUsage) { + EXPECT_EQ(myFunction(), 42); + } + ``` + +2. **Register it in CMake** — add the filename to `test/unit/CMakeLists.txt`: + + ```cmake + target_sources(unit_tests PRIVATE + ... + my_feature_tests.cc # <-- add here + ) + ``` + +3. 
**Build and run**: + + ```bash + cmake --build build -j + ./build/test/unit_tests --filter=MyFeatureTest + ``` + +### Multi-process test (mp_unit/) + +1. **Create the test file** `test/mp_unit/my_feature_tests.cc` (or `.cu`): + + ```cpp + #include "mp_unit_tests.hpp" + + TEST(MyFeatureTest, MultiRank) { + int rank = gEnv->rank; + EXPECT_GE(rank, 0); + } + ``` + + Use fixtures from `mp_unit_tests.hpp` (e.g., `CommunicatorTest`) if you need pre-established connections. + +2. **Register it in CMake** — add the filename to `test/mp_unit/CMakeLists.txt`: + + ```cmake + target_sources(mp_unit_tests PRIVATE + ... + my_feature_tests.cc # <-- add here + ) + ``` + +3. **Build and run**: + + ```bash + cmake --build build -j + mpirun -np 2 ./build/test/mp_unit_tests --filter=MyFeatureTest + ``` + +### Notes + +- No separate test registration step is needed — `TEST()` auto-registers via static initialization. +- The `test_framework` static library is built from `framework.cc` in the top-level `test/CMakeLists.txt` and linked into both `unit_tests` and `mp_unit_tests`. You do not need to modify it. +- Use `.cu` extension for files that contain CUDA kernel code; use `.cc` for host-only tests. +- Each test binary needs a `main()` that calls `RUN_ALL_TESTS()`. See `unit/unit_tests_main.cc` (single-process) and `mp_unit/mp_unit_tests.cc` (multi-process with `Environment` setup). +- Additional run options: `--filter=-Pattern` (exclude), `--exclude-perf-tests` (skip `PERF_TEST`s). + +## Macros + +| Macro | Behavior | +|---|---| +| `TEST(Suite, Name)` | Register a test. If `Suite` is a defined class, it's used as a fixture. | +| `PERF_TEST(Suite, Name)` | Same as `TEST` but marked as perf (skippable via `--exclude-perf-tests`). 
| +| `EXPECT_*` | Non-fatal assertions: `EXPECT_TRUE`, `EXPECT_FALSE`, `EXPECT_EQ`, `EXPECT_NE`, `EXPECT_LT`, `EXPECT_LE`, `EXPECT_GT`, `EXPECT_GE` | +| `ASSERT_*` | Fatal assertions (abort test on failure): same variants as `EXPECT_*`, plus `ASSERT_NO_THROW` | +| `FAIL()` | Fail immediately. Supports streaming: `FAIL() << "reason";` | +| `SKIP_TEST()` | Skip the current test. Supports streaming: `SKIP_TEST() << "reason";` | +| `CUDA_CHECK(call)` | Check a CUDA API return code, throw on error. | + +## Fixtures + +Define a class inheriting from `mscclpp::test::TestCase` with `SetUp()` / `TearDown()`, then use the class name as the suite name: + +```cpp +class MyFixture : public mscclpp::test::TestCase { + public: + void SetUp() override { /* per-test setup */ } + void TearDown() override { /* per-test cleanup */ } + protected: + int sharedState_ = 0; +}; + +TEST(MyFixture, SomeTest) { + sharedState_ = 42; + EXPECT_EQ(sharedState_, 42); +} +``` + +See `mp_unit/mp_unit_tests.hpp` (`BootstrapTest`, `CommunicatorTest`, etc.) for real fixture examples. + +## Global Environments + +Register an `Environment` subclass for one-time global setup/teardown (e.g., MPI bootstrap): + +```cpp +class MyEnv : public mscclpp::test::Environment { + public: + void SetUp() override { /* global init */ } + void TearDown() override { /* global cleanup */ } +}; + +// In main(), before RUN_ALL_TESTS(): +mscclpp::test::TestRegistry::instance().addEnvironment(new MyEnv()); +``` + +See `mp_unit/mp_unit_tests.cc` for the `MultiProcessTestEnv` example. 
+ +## Utilities + +- `mscclpp::test::utils::isMainRank()` — true on MPI rank 0 +- `mscclpp::test::utils::getMPIRank()` / `getMPISize()` +- `mscclpp::test::utils::Timer` — high-resolution timer with `start()`, `stop()`, `elapsedMilliseconds()` +- `mscclpp::test::currentTestName()` — returns `"Suite.Name"` for the running test \ No newline at end of file From be9126ca1b36c4817de622a0aebd87e5382b9a6b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 1 Apr 2026 16:25:19 -0700 Subject: [PATCH 06/21] Fix run-remote.sh to support multi-command scripts (#770) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Fix `run-remote.sh` to correctly execute multi-command scripts (e.g., multiple `mpirun` calls) - The old approach piped decoded script through `base64 -d | bash`, which feeds the script via bash's **stdin**. When `mpirun` (or its child processes) runs, it can consume the remaining stdin, causing bash to never see subsequent commands — only the first command would execute. - The fix decodes the script to a **temp file** and runs `bash -euxo pipefail "$TMP"` instead, so bash reads commands from the file and `mpirun` consuming stdin has no effect. - Applied to both the docker path (pssh + docker exec) and the non-docker path (pssh only). 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --- test/deploy/run-remote.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh index b646ea92..2468243e 100755 --- a/test/deploy/run-remote.sh +++ b/test/deploy/run-remote.sh @@ -97,11 +97,14 @@ if $USE_DOCKER; then INNER+=" cd /root/mscclpp;" INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;" INNER+=" CMD_B64='${CMD_B64}';" - INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail" + INNER+=" TMP=\\\$(mktemp);" + INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d > \\\"\\\$TMP\\\";" + INNER+=" bash -euxo pipefail \\\"\\\$TMP\\\";" + INNER+=" rm -f \\\"\\\$TMP\\\"" parallel-ssh -i "${PSSH_COMMON[@]}" \ "sudo docker exec mscclpp-test bash -c \"${INNER}\"" else parallel-ssh -i "${PSSH_COMMON[@]}" \ - "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail" + "set -euxo pipefail; CMD_B64='${CMD_B64}'; TMP=\$(mktemp); printf '%s' \"\$CMD_B64\" | base64 -d > \"\$TMP\"; bash -euxo pipefail \"\$TMP\"; rm -f \"\$TMP\"" fi From fa95e82e18c5f963b059aefe20939d5ca8a63df2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 7 Apr 2026 08:41:51 -0700 Subject: [PATCH 07/21] Fix CI/CD pipeline issues (#773) This pull request updates the deployment pipeline to allow custom CMake arguments to be passed to the pip install process on remote VMs. 
--------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/templates/deploy.yml | 24 ++++++++++++++++++++++-- .azure-pipelines/templates/ut-npkit.yml | 10 +++++----- test/deploy/setup.sh | 6 ++++++ tools/npkit/npkit_trace_generator.py | 16 ++++++++-------- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml index fc116acf..2f642f1d 100644 --- a/.azure-pipelines/templates/deploy.yml +++ b/.azure-pipelines/templates/deploy.yml @@ -94,7 +94,27 @@ steps: du -sh build/bin/* 2>/dev/null || true workingDirectory: '$(System.DefaultWorkingDirectory)' -# 2. Download SSH key + install packages + start VMSS +# 2. Write CMake args for pip install on remote VMs +- task: Bash@3 + name: WritePipCmakeArgs + displayName: Write pip CMake args + inputs: + targetType: 'inline' + script: | + set -e + PIP_CMAKE_ARGS="" + if [ -n "${{ parameters.gpuArch }}" ]; then + PIP_CMAKE_ARGS="-DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }}" + fi + CMAKE_EXTRA_ARGS='${{ parameters.cmakeArgs }}' + if [ -n "${CMAKE_EXTRA_ARGS}" ]; then + PIP_CMAKE_ARGS="${PIP_CMAKE_ARGS} ${CMAKE_EXTRA_ARGS}" + fi + echo "${PIP_CMAKE_ARGS}" > pip_cmake_args.txt + echo "pip CMake args: $(cat pip_cmake_args.txt)" + workingDirectory: '$(System.DefaultWorkingDirectory)' + +# 3. Download SSH key + install packages + start VMSS - task: DownloadSecureFile@1 name: SshKeyFile displayName: Download key file @@ -120,7 +140,7 @@ steps: inlineScript: | az vmss start --name ${{ parameters.vmssName }} --resource-group ${{ parameters.resourceGroup }} -# 3. Deploy test environment +# 4. 
Deploy test environment - task: Bash@3 name: DeployTestEnv displayName: Deploy Test Env diff --git a/.azure-pipelines/templates/ut-npkit.yml b/.azure-pipelines/templates/ut-npkit.yml index e53b5cf5..1bd89caf 100644 --- a/.azure-pipelines/templates/ut-npkit.yml +++ b/.azure-pipelines/templates/ut-npkit.yml @@ -28,7 +28,7 @@ steps: grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json - template: run-remote-task.yml parameters: @@ -42,14 +42,14 @@ steps: grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_SEND_ENTRY ./npkit_output/npkit_event_trace.json rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json - grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY 
./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json + grep -q NPKIT_EVENT_EXECUTOR_UNPACK_PACKETS_ENTRY ./npkit_output/npkit_event_trace.json - template: stop.yml parameters: diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 80cd10b1..d4996cc2 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -30,6 +30,12 @@ fi if [ "${PLATFORM}" == "rocm" ]; then export CXX=/opt/rocm/bin/hipcc fi + +PIP_CMAKE_ARGS_FILE="/root/mscclpp/pip_cmake_args.txt" +if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then + export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})" + echo "Using CMAKE_ARGS: ${CMAKE_ARGS}" +fi cd /root/mscclpp && pip3 install . pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files diff --git a/tools/npkit/npkit_trace_generator.py b/tools/npkit/npkit_trace_generator.py index c5ed6191..294516e6 100644 --- a/tools/npkit/npkit_trace_generator.py +++ b/tools/npkit/npkit_trace_generator.py @@ -14,25 +14,25 @@ def parse_npkit_event_header(npkit_event_header_path): "NOP", "BARRIER", "PUT", - "PUT_PACKET", - "READ_PUT_PACKET", + "PUT_PACKETS", + "READ_PUT_PACKETS", "PUT_WITH_SIGNAL", "PUT_WITH_SIGNAL_AND_FLUSH", "GET", "COPY", - "COPY_PACKET", - "TRANSFORM_TO_PACKET", + "COPY_PACKETS", + "UNPACK_PACKETS", "SIGNAL", "WAIT", "FLUSH", "REDUCE", - "REDUCE_PACKET", + "REDUCE_PACKETS", "REDUCE_COPY_PACKETS", "REDUCE_SEND", - "REDUCE_SEND_PACKET", + "REDUCE_SEND_PACKETS", "REDUCE_COPY_SEND_PACKETS", - "READ_REDUCE_COPY", - "READ_REDUCE_COPY_SEND", + "READ_REDUCE", + "READ_REDUCE_SEND", "MULTI_LOAD_REDUCE_STORE", "RELAXED_SIGNAL", "RELAXED_WAIT", From 96a72bbd3e71df14f8afca6b4daaf907bbad8e8e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 7 Apr 2026 13:37:02 -0700 Subject: [PATCH 08/21] Support E4M3B15 datatype (#765) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit ## Summary - **Add `fp8_e4m3b15` datatype**: A software-defined FP8 type with 4 exponent bits, 3 mantissa bits, and bias=15 (max finite value: 0.9375). Implemented entirely in software with no HW dependency, using Triton-style bit manipulation through fp16 as intermediate for efficient conversion. - **Add mixed-precision accumulation for allreduce**: All allreduce algorithm variants (packet, NVLS packet, fullmesh, RSAG zero-copy, and others) now support a configurable `accumDtype` parameter, enabling FP8 inputs to be reduced in float16 or float32 for higher accuracy. - **Propagate `accumDtype` through the full API**: The new parameter is threaded from `Algorithm::execute()` → `NativeAlgorithm` → `KernelFunc` → dispatch → CUDA kernels, with `DataType::AUTO` as the default (resolves to input dtype at runtime). - **Add FP8 accumulation correctness tests**: New `test_fp8_accum.py` validates that higher-precision accumulation produces results at least as accurate as native FP8 accumulation across multiple algorithms and sizes. Skipped on CUDA SM < 89 (pre-Hopper); runs on HIP/ROCm. - **Add `test_fp8_accum.py` to CI**: Azure Pipeline `ut.yml` now runs FP8 accumulation tests alongside existing pytests. - **NCCL shim logging cleanup**: Migrated `printf`-style `WARN`/`INFO` calls to streaming-style logging. 
## Key files | Area | Files | |------|-------| | New datatype + vector ops | `include/mscclpp/gpu_data_types.hpp` | | Accumulation reduce helpers | `src/core/include/reduce_kernel.hpp` | | Algorithm API (`accumDtype`) | `include/mscclpp/algorithm.hpp`, `src/core/algorithm.cc` | | Allreduce kernels | `src/ext/collectives/allreduce/*.cu` | | Dispatch + common | `src/ext/collectives/include/allreduce/common.hpp` | | Python bindings | `python/csrc/algorithm.cpp`, `python/mscclpp/_core/algorithm.py` | | Tests | `python/test/test_fp8_accum.py` | | CI | `.azure-pipelines/templates/ut.yml` | ## Test plan - [x] CI passes on H100 (CUDA SM 90) — full FP8 E4M3 + E4M3B15 accumulation tests - [x] CI passes on A100 (CUDA SM 80) — FP8 tests correctly skipped - [x] CI passes on MI300X (ROCm) — FP8 tests run via HIP - [x] Existing `test_mscclpp.py` tests continue to pass - [x] NCCL shim builds and runs correctly with new `accumDtype` defaults 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .azure-pipelines/templates/ut.yml | 1 + docs/guide/mscclpp-torch-integration.md | 3 +- .../customized_allgather.cu | 3 +- .../torch-integration/customized_allgather.cu | 3 +- include/mscclpp/algorithm.hpp | 15 +- include/mscclpp/gpu_data_types.hpp | 771 +++++++++++++++++- python/csrc/algorithm.cpp | 8 +- python/csrc/core_py.cpp | 3 +- python/csrc/gpu_utils_py.cpp | 13 + python/mscclpp/_core/algorithm.py | 8 +- python/test/test_fp8_accum.py | 391 +++++++++ src/core/algorithm.cc | 17 +- src/core/executor/execution_kernel.cu | 6 + src/core/include/execution_kernel.hpp | 27 +- src/core/include/reduce_kernel.hpp | 174 +++- .../allgather/allgather_fullmesh.cu | 3 +- .../allgather/allgather_fullmesh_2.cu | 3 +- .../allreduce/allreduce_allpair_packet.cu | 13 +- .../allreduce/allreduce_fullmesh.cu | 37 +- .../allreduce_nvls_block_pipeline.cu | 14 +- 
.../allreduce/allreduce_nvls_packet.cu | 45 +- .../allreduce/allreduce_nvls_warp_pipeline.cu | 19 +- .../allreduce/allreduce_nvls_zero_copy.cu | 15 +- .../collectives/allreduce/allreduce_packet.cu | 68 +- .../collectives/allreduce/allreduce_rsag.cu | 13 +- .../allreduce/allreduce_rsag_pipeline.cu | 19 +- .../allreduce/allreduce_rsag_zero_copy.cu | 31 +- .../allreduce/allreduce_allpair_packet.hpp | 2 +- .../include/allreduce/allreduce_fullmesh.hpp | 2 +- .../allreduce_nvls_block_pipeline.hpp | 2 +- .../allreduce/allreduce_nvls_packet.hpp | 4 +- .../allreduce_nvls_warp_pipeline.hpp | 2 +- .../allreduce/allreduce_nvls_zero_copy.hpp | 2 +- .../include/allreduce/allreduce_packet.hpp | 2 +- .../include/allreduce/allreduce_rsag.hpp | 2 +- .../allreduce/allreduce_rsag_pipeline.hpp | 2 +- .../allreduce/allreduce_rsag_zero_copy.hpp | 2 +- .../collectives/include/allreduce/common.hpp | 92 +-- src/ext/nccl/algorithm_selector.cc | 3 +- src/ext/nccl/datatype_conversion.hpp | 5 + src/ext/nccl/nccl.cc | 39 +- 41 files changed, 1623 insertions(+), 261 deletions(-) create mode 100644 python/test/test_fp8_accum.py diff --git a/.azure-pipelines/templates/ut.yml b/.azure-pipelines/templates/ut.yml index 9d17e923..743c66e6 100644 --- a/.azure-pipelines/templates/ut.yml +++ b/.azure-pipelines/templates/ut.yml @@ -41,6 +41,7 @@ steps: displayName: Run pytests remoteScript: | mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_fp8_accum.py -x - template: stop.yml parameters: diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 1c966155..b4e4fcdf 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -332,7 +332,8 @@ public: size_t inputSize, size_t outputSize, 
mscclpp::DataType dtype, mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->kernelFunc(ctx, input, output, inputSize, dtype, stream); }, // Context initialization function diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index e78c4777..02df3685 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -101,7 +101,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index d48c4410..907b3ada 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -69,7 +69,8 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { "allgather", "allgather", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, [[maybe_unused]] mscclpp::ReduceOp op, cudaStream_t stream, int nBlocks, - int 
nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) { return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 65b1ab3c..531cb857 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -103,12 +103,14 @@ class Algorithm { /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. + /// @param accumDtype Data type for accumulation during reduction. DataType::AUTO resolves to dtype. /// @return The result of the operation. virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, bool symmetricMemory = false, - const std::unordered_map& extras = {}) = 0; + const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) = 0; /// Reset the algorithm state, clearing any cached contexts. virtual void reset() = 0; @@ -186,10 +188,11 @@ class NativeAlgorithm : public Algorithm { /// @param nBlocks Number of CUDA blocks. /// @param nThreadsPerBlock Number of threads per block. /// @param extras Additional algorithm-specific parameters. + /// @param accumDtype Data type for accumulation (resolved from input dtype if sentinel). /// @return The result of the operation. 
using KernelFunc = std::function, const void*, void*, size_t, size_t, DataType, ReduceOp, - cudaStream_t, int, int, const std::unordered_map&)>; + cudaStream_t, int, int, const std::unordered_map&, DataType)>; /// Function type for creating algorithm contexts. /// @param comm The communicator. @@ -233,8 +236,8 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - bool symmetricMemory = false, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; const std::string& name() const override; const std::string& collective() const override; const std::pair& messageRange() const override; @@ -285,8 +288,8 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, - bool symmetricMemory = false, - const std::unordered_map& extras = {}) override; + bool symmetricMemory = false, const std::unordered_map& extras = {}, + DataType accumDtype = DataType::AUTO) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; void reset() override; diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 1cecbea6..fa31a28f 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -64,18 +64,151 @@ using __bfloat162 = __nv_bfloat162; #endif +/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15. 
+/// Format (MSB first): [sign:1][exponent:4][mantissa:3] +/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention). +/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6. +struct alignas(1) __fp8_e4m3b15 { + uint8_t __x; + + __fp8_e4m3b15() = default; + + /// Construct from raw bits (use __fp8_e4m3b15::fromRaw() for clarity). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(uint8_t raw) : __x(raw) {} + + /// Construct from float32 (explicit to avoid ambiguous conversion chains). + MSCCLPP_HOST_DEVICE_INLINE explicit __fp8_e4m3b15(float val) : __x(fromFloat(val)) {} + + /// Convert to float32. + MSCCLPP_HOST_DEVICE_INLINE operator float() const { return toFloat(__x); } + + /// Construct from a raw bit pattern without conversion. + static MSCCLPP_HOST_DEVICE_INLINE __fp8_e4m3b15 fromRaw(uint8_t bits) { + __fp8_e4m3b15 r; + r.__x = bits; + return r; + } + + private: + /// Decode fp8_e4m3b15 bits → float32. + /// + /// Uses bit manipulation through fp16 as intermediate, adapted from the Triton compiler. + /// fp8_e4m3b15 is identical to fp8_e4m3fn (NVIDIA) except exponent bias is 15 vs 7. + /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8, + /// then convert fp16 → float32. + static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) { + // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN. + uint32_t exp = (bits >> 3) & 0xFu; + if (bits == 0x80 || exp == 15) { + union { + uint32_t u; + float f; + } nan_val = {0x7FC00000u}; + return nan_val.f; + } + if (bits == 0) return 0.0f; + + // Triton-style bit manipulation: fp8 → fp16 → fp32. + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // fp16 layout: [S:1][E:5][M:10] (bias=15) + // + // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1 + // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15. 
+ // Refer: + // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34 + uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16 + uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position + uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign) + uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 + + // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0 + // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents + // the subnormal fp16 value since both share bias=15. + + // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP). + union { + uint16_t u; + __half h; + } cvt = {fp16_bits}; + return __half2float(cvt.h); + } + + /// Encode float32 → fp8_e4m3b15 bits. + /// + /// Algorithm adapted from Triton: float32 → fp16 → bit-manipulate → fp8. + /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15), + /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1. + static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) { + union { + float f; + uint32_t u; + } in = {val}; + + // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz). + if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u; + + // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP). + __half h_val = __float2half_rn(val); + union { + __half h; + uint16_t u; + } cvt = {h_val}; + uint16_t fp16_bits = cvt.u; + + // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80. + uint16_t abs_fp16 = fp16_bits & 0x7FFFu; + if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u; + + // Reconstruct with sign. + uint16_t sign16 = fp16_bits & 0x8000u; + + // Triton-style: fp16 → fp8. 
+ // fp16 layout: [S:1][E:5][M:10] (bias=15) + // fp8 layout: [S:1][E:4][M:3] (bias=15) + // + // mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080) + // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias. + // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0 + // Finally: prmt for byte extraction. + // + // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte. + uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u); + // The upper byte now contains [E:4][M:3][round_bit]. + // Combine with sign and extract. + uint16_t with_sign = sign16 | adjusted; + uint8_t result = (uint8_t)(with_sign >> 8); + + // Zero → 0x00 (ensure positive zero, not negative zero which is NaN). + if ((result & 0x7Fu) == 0) result = 0x00u; + + return result; + } +}; + +/// Packed 2x fp8_e4m3b15 storage. +struct alignas(2) __fp8x2_e4m3b15 { + uint16_t __x; +}; + +/// Packed 4x fp8_e4m3b15 storage. +struct alignas(4) __fp8x4_e4m3b15 { + uint32_t __x; +}; + namespace mscclpp { /// Data types supported by mscclpp operations. enum class DataType { - INT32, // 32-bit signed integer. - UINT32, // 32-bit unsigned integer. - FLOAT16, // IEEE 754 half precision. - FLOAT32, // IEEE 754 single precision. - BFLOAT16, // bfloat16 precision. - FLOAT8_E4M3, // float8 with E4M3 layout. - FLOAT8_E5M2, // float8 with E5M2 layout. - UINT8, // 8-bit unsigned integer. + INT32, // 32-bit signed integer. + UINT32, // 32-bit unsigned integer. + FLOAT16, // IEEE 754 half precision. + FLOAT32, // IEEE 754 single precision. + BFLOAT16, // bfloat16 precision. + FLOAT8_E4M3, // float8 with E4M3 layout. + FLOAT8_E5M2, // float8 with E5M2 layout. + UINT8, // 8-bit unsigned integer. + FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). + AUTO = 255, // Sentinel: resolve to the input dtype at runtime. }; /// Word array. 
@@ -97,6 +230,7 @@ struct alignas(Bytes) Words {}; template union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); + static_assert(sizeof(StorageT) >= sizeof(T) * N, "StorageT must cover the full vector size"); T data[N]; Words words; @@ -127,13 +261,14 @@ union alignas(sizeof(T) * N) VectorTypeImpl { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -// Helper template to get the appropriate vector type for a given element type and count +// Helper template to get the appropriate vector type for a given element type and count. template struct VectorTypeHelper { - using type = - VectorTypeImpl>>; + static constexpr int Bytes = N * sizeof(T); + using type = VectorTypeImpl< + T, N, + std::conditional_t>>>>; }; /// Vector type - clean user interface (automatically selects appropriate storage type) @@ -170,6 +305,11 @@ DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); DEFINE_VEC(f16x8, __half, 8, uint4); DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); +// Aliases for large vector types (>16 bytes) where no native CUDA storage type exists. 
+using f32x8 = VectorType; +using f32x16 = VectorType; +using f16x16 = VectorType<__half, 16>; + #if defined(__FP8_TYPES_EXIST__) DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); @@ -181,6 +321,12 @@ DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); #endif + +// fp8_e4m3b15 vectors (always available — software type, no HW dependency) +DEFINE_VEC(f8_e4m3b15x2, __fp8_e4m3b15, 2, __fp8x2_e4m3b15); +DEFINE_VEC(f8_e4m3b15x4, __fp8_e4m3b15, 4, __fp8x4_e4m3b15); +DEFINE_VEC(f8_e4m3b15x8, __fp8_e4m3b15, 8, uint2); +DEFINE_VEC(f8_e4m3b15x16, __fp8_e4m3b15, 16, uint4); #undef DEFINE_VEC #if defined(MSCCLPP_DEVICE_COMPILE) @@ -254,6 +400,21 @@ MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { } #endif +// --- f32x2 arithmetic --- + +template +MSCCLPP_DEVICE_INLINE f32x2 operator+(const f32x2& a, const f32x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) && (__CUDA_ARCH__ >= 1000) + // Blackwell (SM 10.0+): packed float2 add in a single instruction. + return __fadd2_rn(a.storage, b.storage); +#else + f32x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + template MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { __half2 result; @@ -265,6 +426,18 @@ MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { return result; } +template +MSCCLPP_DEVICE_INLINE f16x4 operator+(const f16x4& a, const f16x4& b) { + // Decompose into 2× packed __hadd2 (2 instructions instead of 4 scalar __hadd). 
+ const f16x2* a2 = reinterpret_cast(&a); + const f16x2* b2 = reinterpret_cast(&b); + f16x4 result; + f16x2* r2 = reinterpret_cast(&result); + r2[0] = a2[0] + b2[0]; + r2[1] = a2[1] + b2[1]; + return result; +} + template MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { __bfloat162 result; @@ -449,6 +622,14 @@ MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { return (a < b ? a : b); } +template <> +MSCCLPP_DEVICE_INLINE f32x2 min(const f32x2& a, const f32x2& b) { + f32x2 result; + result.data[0] = fminf(a.data[0], b.data[0]); + result.data[1] = fminf(a.data[1], b.data[1]); + return result; +} + template <> MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { #if defined(MSCCLPP_DEVICE_HIP) @@ -489,6 +670,51 @@ MSCCLPP_DEVICE_INLINE u8x4 min(const u8x4& a, const u8x4& b) { #endif } +/// Convert a vector type From to vector type To. +/// Primary template with auto-decomposition: vectors with N > 4 elements decompose into x4 chunks, +/// vectors with N == 4 decompose into x2 chunks, enabling optimized x2/x4 specializations to be reached. +/// Specialized below for optimized FP8 conversion paths at x2/x4 level. 
+template +MSCCLPP_DEVICE_INLINE To to(const From& v) { + static_assert(To::Size == From::Size, "to: vector sizes must match"); + constexpr int N = From::Size; + + // Auto-decompose: N > 4 → split into x4 chunks + if constexpr (N > 4 && N % 4 == 0) { + constexpr int nChunks = N / 4; + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); +#pragma unroll + for (int c = 0; c < nChunks; ++c) { + out[c] = to(in[c]); + } + return result; + } + // Auto-decompose: N == 4 → split into 2x x2 chunks + else if constexpr (N == 4) { + using FromChunk = VectorType; + using ToChunk = VectorType; + const FromChunk* in = reinterpret_cast(&v); + To result; + ToChunk* out = reinterpret_cast(&result); + out[0] = to(in[0]); + out[1] = to(in[1]); + return result; + } + // Base case: element-wise conversion + else { + To result; +#pragma unroll + for (int i = 0; i < N; ++i) { + result.data[i] = static_cast(v.data[i]); + } + return result; + } +} + #if defined(__FP8_TYPES_EXIST__) template <> MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { @@ -551,7 +777,526 @@ MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { return result; } + +// --- f8_e4m3 -> f32 specializations --- + +/// f8_e4m3x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: fp8 -> float (via __builtin_amdgcn_cvt_pk_f32_fp8). 
+template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e4m3x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f8_e5m2 -> f32 specializations --- + +/// f8_e5m2x2 -> f32x2. +/// NVIDIA: fp8 -> half (via __nv_cvt_fp8x2_to_halfraw2) -> float. +/// HIP gfx942: bf8 -> float (via __builtin_amdgcn_cvt_pk_f32_bf8). 
+template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e5m2x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, 0); + f32x2 result; + result.data[0] = f[0]; + result.data[1] = f[1]; + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E5M2); + f32x2 result; + result.data[0] = __half2float(bit_cast<__half>(h2.x)); + result.data[1] = __half2float(bit_cast<__half>(h2.y)); + return result; +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e5m2x4 -> f32x4. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e5m2x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto lo = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, false); + auto hi = __builtin_amdgcn_cvt_pk_f32_bf8(v.storage.__x, true); + f32x4 result; + result.data[0] = lo[0]; + result.data[1] = lo[1]; + result.data[2] = hi[0]; + result.data[3] = hi[1]; + return result; +#else + const f8_e5m2x2* pair = reinterpret_cast(&v); + f32x2 lo = to(pair[0]); + f32x2 hi = to(pair[1]); + f32x4 result; + result.data[0] = lo.data[0]; + result.data[1] = lo.data[1]; + result.data[2] = hi.data[0]; + result.data[3] = hi.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e4m3 specializations (downcast) --- + +/// f32x2 -> f8_e4m3x2. +/// HIP gfx942: float -> fp8 (via __builtin_amdgcn_cvt_pk_fp8_f32). +/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). 
+template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3x4. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[0], v.data[1], 0, false); + packed = __builtin_amdgcn_cvt_pk_fp8_f32(v.data[2], v.data[3], packed, true); + return bit_cast(packed); +#else + f32x2 lo, hi; + lo.data[0] = v.data[0]; + lo.data[1] = v.data[1]; + hi.data[0] = v.data[2]; + hi.data[1] = v.data[3]; + f8_e4m3x2 lo_fp8 = to(lo); + f8_e4m3x2 hi_fp8 = to(hi); + f8_e4m3x4 result; + result.data[0] = lo_fp8.data[0]; + result.data[1] = lo_fp8.data[1]; + result.data[2] = hi_fp8.data[0]; + result.data[3] = hi_fp8.data[1]; + return result; +#endif +} + +// --- f32 -> f8_e5m2 specializations (downcast) --- + +/// f32x2 -> f8_e5m2x2. +/// HIP gfx942: float -> bf8 (via __builtin_amdgcn_cvt_pk_bf8_f32). 
+/// NVIDIA SM90+: float -> half -> fp8 (via __nv_cvt_halfraw2_to_fp8x2 with __NV_E5M2). +/// NVIDIA pre-SM90: float -> half -> fp8 (via __nv_cvt_halfraw_to_fp8, element-wise). +template <> +MSCCLPP_DEVICE_INLINE f8_e5m2x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2; + h2.x = bit_cast(__float2half_rn(v.data[0])); + h2.y = bit_cast(__float2half_rn(v.data[1])); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E5M2); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(__float2half_rn(v.data[0])); + h1.x = bit_cast(__float2half_rn(v.data[1])); + f8_e5m2x2 result; + result.data[0] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E5M2)); + result.data[1] = bit_cast<__fp8_e5m2>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E5M2)); + return result; +#else + f8_e5m2x2 result; + result.data[0] = static_cast<__fp8_e5m2>(v.data[0]); + result.data[1] = static_cast<__fp8_e5m2>(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e5m2x4. 
+template <> +MSCCLPP_DEVICE_INLINE f8_e5m2x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + uint32_t packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[0], v.data[1], 0, false); + packed = __builtin_amdgcn_cvt_pk_bf8_f32(v.data[2], v.data[3], packed, true); + return bit_cast(packed); +#else + f32x2 lo, hi; + lo.data[0] = v.data[0]; + lo.data[1] = v.data[1]; + hi.data[0] = v.data[2]; + hi.data[1] = v.data[3]; + f8_e5m2x2 lo_fp8 = to(lo); + f8_e5m2x2 hi_fp8 = to(hi); + f8_e5m2x4 result; + result.data[0] = lo_fp8.data[0]; + result.data[1] = lo_fp8.data[1]; + result.data[2] = hi_fp8.data[0]; + result.data[3] = hi_fp8.data[1]; + return result; +#endif +} + +// --- f8_e4m3 <-> f16 conversion specializations --- + +/// f8_e4m3x2 -> f16x2. +/// NVIDIA SM90+: packed intrinsic (1 instruction). +/// HIP gfx942: fp8 -> float -> half (via AMD builtin). +/// Pre-SM90 / fallback: element-wise scalar conversion. +template <> +MSCCLPP_DEVICE_INLINE f16x2 to(const f8_e4m3x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + auto f = __builtin_amdgcn_cvt_pk_f32_fp8(v.storage.__x, 0); + f16x2 result; + result.data[0] = __float2half(f[0]); + result.data[1] = __float2half(f[1]); + return result; +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(bit_cast<__nv_fp8x2_storage_t>(v.storage), __NV_E4M3); + return bit_cast(h2); +#else + f16x2 result; + result.data[0] = static_cast<__half>(v.data[0]); + result.data[1] = static_cast<__half>(v.data[1]); + return result; +#endif +} + +/// f16x2 -> f8_e4m3x2. +/// NVIDIA SM90+: packed intrinsic (1 instruction). +/// HIP gfx942: half -> float -> fp8 (via AMD builtin). +/// Pre-SM90: element-wise scalar conversion. 
+template <> +MSCCLPP_DEVICE_INLINE f8_e4m3x2 to(const f16x2& v) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float f0 = __half2float(v.data[0]); + float f1 = __half2float(v.data[1]); + uint32_t packed = __builtin_amdgcn_cvt_pk_fp8_f32(f0, f1, 0, false); + return bit_cast(static_cast<__hip_fp8x2_storage_t>(packed)); +#elif defined(MSCCLPP_DEVICE_CUDA) && __CUDA_ARCH__ >= 900 + __half2_raw h2 = bit_cast<__half2_raw>(v); + __nv_fp8x2_storage_t fp8x2 = __nv_cvt_halfraw2_to_fp8x2(h2, __NV_SATFINITE, __NV_E4M3); + return bit_cast(fp8x2); +#elif defined(MSCCLPP_DEVICE_CUDA) + __half_raw h0, h1; + h0.x = bit_cast(v.data[0]); + h1.x = bit_cast(v.data[1]); + f8_e4m3x2 result; + result.data[0] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h0, __NV_SATFINITE, __NV_E4M3)); + result.data[1] = bit_cast<__fp8_e4m3>(__nv_cvt_halfraw_to_fp8(h1, __NV_SATFINITE, __NV_E4M3)); + return result; +#else + f8_e4m3x2 result; + result.data[0] = static_cast<__fp8_e4m3>(v.data[0]); + result.data[1] = static_cast<__fp8_e4m3>(v.data[1]); + return result; +#endif +} + #endif // defined(__FP8_TYPES_EXIST__) + +// --- fp8_e4m3b15 <-> fp16 direct conversion specializations --- +// These are the PRIMARY conversions: fp8_b15 <-> fp16 is just a 1-bit exponent shift +// (E4 bias=15 <-> E5 bias=15), no precision loss since fp16 has 10 mantissa bits +// vs fp8's 3. fp32 conversions are derived by routing through fp16. + +/// f8_e4m3b15x2 -> f16x2. +/// Direct fp8 -> fp16 via branch-free bit manipulation. +template <> +MSCCLPP_DEVICE_INLINE f16x2 to(const f8_e4m3b15x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint16_t in = v.storage.__x; + // Spread 2 fp8 bytes into packed fp16 pair, adjust exponent E4->E5. 
+ uint32_t a0 = ((uint32_t)(in & 0xFFu) << 8) | ((uint32_t)(in >> 8) << 24); + uint32_t b0 = (a0 & 0x7f007f00u) >> 1; + uint32_t out0 = b0 | (a0 & 0x80008000u); + __half2 h; + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h)) : "r"(out0)); + return h; +#else + f16x2 result; + result.data[0] = __float2half(float(v.data[0])); + result.data[1] = __float2half(float(v.data[1])); + return result; +#endif +} + +/// f8_e4m3b15x4 -> f16x4. +/// Uses __byte_perm + lop3 for branch-free vectorized conversion. +template <> +MSCCLPP_DEVICE_INLINE f16x4 to(const f8_e4m3b15x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in = v.storage.__x; + uint32_t a0 = __byte_perm(0u, in, 0x5746u); + uint32_t a0_shr = a0 >> 1; + uint32_t a0_sign = a0 & 0x80008000u; + uint32_t out0; + asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out0) : "r"(a0_shr), "r"(0x3f803f80u), "r"(a0_sign)); + uint32_t a1 = __byte_perm(a0, 0u, 0x2301u); + uint32_t a1_shr = a1 >> 1; + uint32_t a1_sign = a1 & 0x80008000u; + uint32_t out1; + asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(out1) : "r"(a1_shr), "r"(0x3f803f80u), "r"(a1_sign)); + f16x4 result; + asm("mov.b32 %0, %1;" : "=r"(result.words[0]) : "r"(out0)); + asm("mov.b32 %0, %1;" : "=r"(result.words[1]) : "r"(out1)); + return result; +#else + f16x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __float2half(float(v.data[i])); + } + return result; +#endif +} + +/// f16x2 -> f8_e4m3b15x2. +/// Direct fp16 -> fp8 via clamp + exponent shift E5->E4 + pack. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f16x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in0; + asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast(&v))); + // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16). + uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16; + uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu; + alo = alo < 0x3B80u ? alo : 0x3B80u; + ahi = ahi < 0x3B80u ? 
ahi : 0x3B80u; + uint32_t a0 = alo | (ahi << 16); + a0 = a0 * 2u + 0x00800080u; + uint32_t b0 = a0 | (in0 & 0x80008000u); + uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u)); + return bit_cast(packed); +#else + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(__half2float(v.data[0])); + result.data[1] = __fp8_e4m3b15(__half2float(v.data[1])); + return result; +#endif +} + +/// f16x4 -> f8_e4m3b15x4. +/// Uses __vminu2 + lop3 + __byte_perm for branch-free vectorized conversion. +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f16x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + uint32_t in0, in1; + asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(v.words[0])); + asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1])); + uint32_t abs0 = in0 & 0x7fff7fffu; + uint32_t abs1 = in1 & 0x7fff7fffu; + uint32_t a0 = __vminu2(abs0, 0x3B803B80u); + uint32_t a1 = __vminu2(abs1, 0x3B803B80u); + a0 = a0 * 2u + 0x00800080u; + a1 = a1 * 2u + 0x00800080u; + uint32_t b0, b1; + asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b0) : "r"(a0), "r"(in0), "r"(0x80008000u)); + asm("lop3.b32 %0, %1, %2, %3, 0xf8;" : "=r"(b1) : "r"(a1), "r"(in1), "r"(0x80008000u)); + uint32_t packed = __byte_perm(b0, b1, 0x7531u); + return bit_cast(packed); +#else + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(__half2float(v.data[i])); + } + return result; +#endif +} + +// --- fp8_e4m3b15 <-> f32 conversion specializations --- +// Derived from fp16 conversions: fp8→f32 = fp8→fp16→f32, f32→fp8 = f32→fp16→fp8. + +/// f8_e4m3b15x2 -> f32x2. +/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32. +template <> +MSCCLPP_DEVICE_INLINE f32x2 to(const f8_e4m3b15x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + f16x2 h = to(v); + float2 f2 = __half22float2(h); + return bit_cast(f2); +#else + f32x2 result; + result.data[0] = float(v.data[0]); + result.data[1] = float(v.data[1]); + return result; +#endif +} + +/// f8_e4m3b15x4 -> f32x4. 
+/// Routes through fp16: fp8→fp16 (bit manip) then fp16→f32. +template <> +MSCCLPP_DEVICE_INLINE f32x4 to(const f8_e4m3b15x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + f16x4 h = to(v); + __half2 h0, h1; + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h0)) : "r"(h.words[0])); + asm("mov.b32 %0, %1;" : "=r"(*reinterpret_cast(&h1)) : "r"(h.words[1])); + float2 f0 = __half22float2(h0); + float2 f1 = __half22float2(h1); + f32x4 result; + result.data[0] = f0.x; + result.data[1] = f0.y; + result.data[2] = f1.x; + result.data[3] = f1.y; + return result; +#else + f32x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = float(v.data[i]); + } + return result; +#endif +} + +/// f32x2 -> f8_e4m3b15x2. +/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack). +template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f32x2& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + float2 f2 = {v.data[0], v.data[1]}; + __half2 h = __float22half2_rn(f2); + return to(h); +#else + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(v.data[0]); + result.data[1] = __fp8_e4m3b15(v.data[1]); + return result; +#endif +} + +/// f32x4 -> f8_e4m3b15x4. +/// Routes through fp16: f32→fp16 then fp16→fp8 (clamp + exponent shift + pack). 
+template <> +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f32x4& v) { +#if defined(MSCCLPP_DEVICE_CUDA) + float2 f01 = {v.data[0], v.data[1]}; + float2 f23 = {v.data[2], v.data[3]}; + __half2 h01 = __float22half2_rn(f01); + __half2 h23 = __float22half2_rn(f23); + f16x4 h; + asm("mov.b32 %0, %1;" : "=r"(h.words[0]) : "r"(*reinterpret_cast(&h01))); + asm("mov.b32 %0, %1;" : "=r"(h.words[1]) : "r"(*reinterpret_cast(&h23))); + return to(h); +#else + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(v.data[i]); + } + return result; +#endif +} + +// --- fp8_e4m3b15 arithmetic (software, always available) --- + +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 operator+(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) { + return __fp8_e4m3b15(float(a) + float(b)); +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 operator+(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) { + f8_e4m3b15x2 result; + result.data[0] = __fp8_e4m3b15(float(a.data[0]) + float(b.data[0])); + result.data[1] = __fp8_e4m3b15(float(a.data[1]) + float(b.data[1])); + return result; +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 operator+(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) { + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + result.data[i] = __fp8_e4m3b15(float(a.data[i]) + float(b.data[i])); + } + return result; +} + +// --- fp8_e4m3b15 min (software) --- + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3b15 min(const __fp8_e4m3b15& a, const __fp8_e4m3b15& b) { + return __fp8_e4m3b15(fminf(float(a), float(b))); +} + +MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 min(const f8_e4m3b15x2& a, const f8_e4m3b15x2& b) { + f8_e4m3b15x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 min(const f8_e4m3b15x4& a, const f8_e4m3b15x4& b) { + f8_e4m3b15x4 result; +#pragma unroll + for (int i = 0; i < 4; ++i) { + 
result.data[i] = mscclpp::min(a.data[i], b.data[i]); + } + return result; +} + #endif // MSCCLPP_DEVICE_COMPILE } // namespace mscclpp diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 1a93cbc0..1cb3f253 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -75,15 +75,17 @@ void register_algorithm(nb::module_& m) { [](Algorithm& self, std::shared_ptr comm, uintptr_t input, uintptr_t output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream, std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory, - std::unordered_map extras) { + std::unordered_map extras, int32_t accumDtype) { return self.execute(comm, reinterpret_cast(input), reinterpret_cast(output), inputSize, outputSize, dtype, op, reinterpret_cast(stream), executor, - nBlocks, nThreadsPerBlock, symmetricMemory, extras); + nBlocks, nThreadsPerBlock, symmetricMemory, extras, + static_cast(accumDtype)); }, nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"), nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr, nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false, - nb::arg("extras") = std::unordered_map()) + nb::arg("extras") = std::unordered_map(), + nb::arg("accum_dtype") = static_cast(DataType::AUTO)) .def("reset", &Algorithm::reset); nb::class_(algorithmClass, "Constraint") diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index 47d76ac4..b8649564 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -47,7 +47,8 @@ void register_core(nb::module_& m) { .value("bfloat16", DataType::BFLOAT16) .value("float8_e4m3", DataType::FLOAT8_E4M3) .value("float8_e5m2", DataType::FLOAT8_E5M2) - .value("uint8", DataType::UINT8); + .value("uint8", DataType::UINT8) + .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15); nb::class_(m, "CppBootstrap") 
.def("get_rank", &Bootstrap::getRank) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 6995756b..60880456 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -34,6 +34,19 @@ static DLDataType getDlType(std::string type) { return DLDataType{kDLBfloat, 16, 1}; } else if (type == "torch.float16") { return DLDataType{kDLFloat, 16, 1}; + } else if (type == "torch.float8_e4m3fn") { + return DLDataType{kDLFloat8_e4m3fn, 8, 1}; + } else if (type == "torch.float8_e4m3fnuz") { + return DLDataType{kDLFloat8_e4m3fnuz, 8, 1}; + } else if (type == "torch.float8_e5m2") { + return DLDataType{kDLFloat8_e5m2, 8, 1}; + } else if (type == "torch.float8_e5m2fnuz") { + return DLDataType{kDLFloat8_e5m2fnuz, 8, 1}; + } else if (type == "torch.uint8") { + return DLDataType{kDLUInt, 8, 1}; + } else if (type == "fp8_e4m3b15") { + // No standard DLPack code for fp8_e4m3b15; store as raw uint8 bytes. + return DLDataType{kDLUInt, 8, 1}; } else { throw Error("Unsupported type: " + type, ErrorCode::InvalidUsage); } diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index 744cf39e..f12a3027 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -177,6 +177,7 @@ class Algorithm: nthreads_per_block=0, symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, + accum_dtype: Optional[CppDataType] = None, ) -> int: """Execute the collective algorithm. @@ -194,10 +195,14 @@ class Algorithm: nthreads_per_block: Number of threads per block (0 for auto-selection). symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. + accum_dtype: Data type for accumulation during reduction. If None, defaults to + the same as dtype. Use DataType.float32 for high-precision FP8 accumulation. Returns: The result code (0 for success). 
""" + merged_extras = dict(extras) if extras is not None else {} + accum_dtype = accum_dtype if accum_dtype is not None else dtype return self._algorithm.execute( comm, int(input_buffer), @@ -211,7 +216,8 @@ class Algorithm: nblocks, nthreads_per_block, symmetric_memory, - extras if extras is not None else {}, + merged_extras, + int(accum_dtype), ) def reset(self): diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py new file mode 100644 index 00000000..3a6c67f1 --- /dev/null +++ b/python/test/test_fp8_accum.py @@ -0,0 +1,391 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Correctness test for FP8 allreduce with different accumulation types. +# +# Verifies that FP8 allreduce with higher-precision accumulation produces +# results at least as accurate as native FP8 accumulation, by comparing +# against a float32 reference. +# +# Usage: +# mpirun -np 8 pytest python/test/test_fp8_accum.py -v + +import cupy as cp +import numpy as np +import pytest + +from mscclpp import CommGroup, GpuBuffer, DataType, ReduceOp, is_nvls_supported +from mscclpp.ext import AlgorithmCollectionBuilder +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group + +# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs. +# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed. 
+_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip +# TODO(binyli): Skip hip for now, will fix it in the next PR +_skip_fp8 = _is_hip or int(cp.cuda.Device().compute_capability) < 89 +pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA (HIP not yet supported)") + +# --------------------------------------------------------------------------- +# FP8 E4M3FN helpers (bias=7, no infinity, NaN = exp=15 & mant=7) +# --------------------------------------------------------------------------- + + +def e4m3fn_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3FN bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-7) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 7).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-6) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-6)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 & mant==7 + nan_mask = (exp == 15) & (mant == 7) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3fn(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3FN bit patterns. + + Uses a lookup-table approach: precompute all 128 positive E4M3FN values, + then find nearest match per element via chunked broadcast comparison. 
+ """ + # Build lookup table of all 128 positive E4M3FN values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3fn_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -448.0, 448.0) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80) +# --------------------------------------------------------------------------- + + +def e4m3b15_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3B15 bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-15) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 15).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-14) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-14)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero + 
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) + # NaN: exp==15 or negative zero (0x80) + nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80) + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3b15(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3B15 bit patterns. + + Same lookup-table approach as float_to_e4m3fn. + """ + # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F) + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3b15_to_float(all_bytes) # (128,) float32 + # Mark NaN entries as inf so they're never selected as nearest + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + # Clamp input and extract sign + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -0.9375, 0.9375) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + # (chunk_size, 128) difference matrix + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + # Combine with sign bit + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # Handle exact zero + result = cp.where(absval == 0, cp.uint8(0), result) + return result + + +# --------------------------------------------------------------------------- +# Shared test helpers +# --------------------------------------------------------------------------- + + +def setup_algorithms(mpi_group): + """Build default algorithms and return (comm_group, algo_map, scratch_buf).""" + comm_group = CommGroup(mpi_group.comm) + scratch = GpuBuffer(1 << 27, dtype=cp.uint8) # 128 MB + AlgorithmCollectionBuilder.reset() + builder 
= AlgorithmCollectionBuilder() + algorithms = builder.build_default_algorithms( + scratch_buffer=scratch.data.ptr, + scratch_buffer_size=scratch.nbytes, + rank=comm_group.my_rank, + ) + algo_map = {a.name: a for a in algorithms} + return comm_group, algo_map, scratch + + +def run_allreduce(algo, comm_group, buffer, dtype, accum_dtype=None, nblocks=0, nthreads_per_block=0): + """Run allreduce in-place on buffer and return a copy of the result.""" + ret = algo.execute( + comm=comm_group.communicator, + input_buffer=buffer.data.ptr, + output_buffer=buffer.data.ptr, + input_size=buffer.nbytes, + output_size=buffer.nbytes, + dtype=dtype, + op=ReduceOp.SUM, + stream=cp.cuda.get_current_stream().ptr, + nblocks=nblocks, + nthreads_per_block=nthreads_per_block, + symmetric_memory=True, + accum_dtype=accum_dtype, + ) + cp.cuda.Device().synchronize() + assert ret == 0, f"Allreduce failed with error code {ret}" + return buffer.copy() + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3 accumulation correctness +# --------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_fullmesh", + "default_allreduce_rsag_zero_copy", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 16384, 65536, 262144, 1048576]) +def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3 allreduce with higher-precision accumulation is at + least as accurate as native FP8 accumulation, across all algorithm variants.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + algo = algo_map[algo_name] + + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("fp8_native", 
DataType.float8_e4m3), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy and fullmesh need explicit block/thread counts + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + elif "fullmesh" in algo_name: + nb = 35 + nt = 512 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank data + cp.random.seed(42 + rank) + src_f32 = cp.random.randn(size).astype(cp.float32) + src_f32 = cp.clip(src_f32, -240.0, 240.0) + src_fp8 = float_to_e4m3fn(src_f32) + + # Copy into symmetric buffer + buf[:] = src_fp8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + result_f32 = e4m3fn_to_float(result) + + # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32 + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in range(world_size): + cp.random.seed(42 + r) + rank_data = cp.random.randn(size).astype(cp.float32) + rank_data = cp.clip(rank_data, -240.0, 240.0) + rank_data_fp8 = float_to_e4m3fn(rank_data) + ref_f32 += e4m3fn_to_float(rank_data_fp8) + + # Compute errors + abs_err = cp.abs(result_f32 - ref_f32) + mean_abs_err = float(cp.mean(abs_err)) + errors[accum_label] = mean_abs_err + + # Reset between runs + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native fp8 + assert ( + errors["float16"] <= errors["fp8_native"] + 1e-6 + ), f"float16 accum ({errors['float16']:.6f}) worse than native ({errors['fp8_native']:.6f})" + assert ( + errors["float32"] <= errors["fp8_native"] + 1e-6 + ), f"float32 accum ({errors['float32']:.6f}) worse than native ({errors['fp8_native']:.6f})" + + +# --------------------------------------------------------------------------- +# Test: FP8 E4M3B15 accumulation correctness +# 
--------------------------------------------------------------------------- + + +@parametrize_mpi_groups(8) +@pytest.mark.parametrize( + "algo_name", + [ + "default_allreduce_packet", + "default_allreduce_nvls_packet", + "default_allreduce_rsag_zero_copy", + ], +) +@pytest.mark.parametrize("size", [1024, 4096, 65536]) +def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): + """Verify that FP8 E4M3B15 allreduce with higher-precision accumulation is at + least as accurate as native E4M3B15 accumulation.""" + rank = mpi_group.comm.rank + world_size = mpi_group.comm.size + + comm_group, algo_map, scratch = setup_algorithms(mpi_group) + if algo_name not in algo_map: + pytest.skip(f"{algo_name} not available") + + algo = algo_map[algo_name] + buf = GpuBuffer(size, dtype=cp.uint8) + + accum_configs = [ + ("e4m3b15_native", DataType.float8_e4m3b15), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + + # rsag_zero_copy needs explicit block/thread counts, scaled to data size + if "rsag" in algo_name: + nb = max(1, min(32, size // (world_size * 32))) + nt = 1024 + else: + nb = 0 + nt = 0 + + errors = {} + for accum_label, accum_dtype in accum_configs: + # Generate deterministic per-rank random uint8 values in valid e4m3b15 range + cp.random.seed(42 + rank) + raw = cp.random.randint(0, 0x78, (size,), dtype=cp.uint8) + signs = cp.random.randint(0, 2, (size,), dtype=cp.uint8).astype(cp.uint8) << 7 + src_uint8 = raw | signs + # Fix negative zero -> positive zero + src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8) + + # Copy into symmetric buffer + buf[:] = src_uint8 + cp.cuda.Device().synchronize() + + # Run allreduce + result = run_allreduce( + algo, + comm_group, + buf, + dtype=DataType.float8_e4m3b15, + accum_dtype=accum_dtype, + nblocks=nb, + nthreads_per_block=nt, + ) + + # Decode result + result_f32 = e4m3b15_to_float(result) + + # Compute float32 reference + ref_f32 = cp.zeros(size, dtype=cp.float32) + for r in 
range(world_size): + cp.random.seed(42 + r) + raw_r = cp.random.randint(0, 0x78, (size,), dtype=cp.uint8) + signs_r = cp.random.randint(0, 2, (size,), dtype=cp.uint8).astype(cp.uint8) << 7 + bits_r = raw_r | signs_r + bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r) + ref_f32 += e4m3b15_to_float(bits_r) + + # Clamp reference to e4m3b15 representable range + ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375) + + # Compute errors (only on valid entries) + valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) + abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) + mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 + errors[accum_label] = mean_abs_err + + algo.reset() + + # Higher-precision accumulation should be at least as accurate as native + assert ( + errors["float16"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float16 accum ({errors['float16']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" + assert ( + errors["float32"] <= errors["e4m3b15_native"] + 1e-8 + ), f"float32 accum ({errors['float32']:.8f}) worse than native ({errors['e4m3b15_native']:.8f})" diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index 99e7b031..ffa53aa8 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -41,7 +41,9 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF CommResult NativeAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr, int nBlocks, int nThreadsPerBlock, - bool symmetricMemory, const std::unordered_map& extras) { + bool symmetricMemory, const std::unordered_map& extras, + DataType accumDtype) { + if (accumDtype == DataType::AUTO) accumDtype = dtype; if (!initialized_) { initFunc_(comm); initialized_ = true; @@ -53,7 +55,7 @@ CommResult NativeAlgorithm::execute(std::shared_ptr comm, const vo contexts_[ctxKey] = ctx; } return kernelLaunchFunc_(contexts_[ctxKey], input, output, 
inputSize, outputSize, dtype, op, stream, nBlocks, - nThreadsPerBlock, extras); + nThreadsPerBlock, extras, accumDtype); } const std::string& NativeAlgorithm::name() const { return name_; } @@ -77,10 +79,7 @@ const CollectiveBufferMode& NativeAlgorithm::bufferMode() const { return bufferM Algorithm::Constraint NativeAlgorithm::constraint() const { return constraint_; } -void NativeAlgorithm::reset() { - contexts_.clear(); - initialized_ = false; -} +void NativeAlgorithm::reset() { contexts_.clear(); } void AlgorithmCollection::registerAlgorithm(const std::string collective, const std::string algoName, std::shared_ptr algorithm) { @@ -166,7 +165,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; } CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream, std::shared_ptr executor, int, int, bool, - const std::unordered_map&) { + const std::unordered_map&, DataType) { if (!executor) { THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute"); } @@ -192,6 +191,10 @@ CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* plan_, stream); break; #endif + case DataType::FLOAT8_E4M3B15: + executor->execute(rank, (__fp8_e4m3b15*)input, (__fp8_e4m3b15*)output, inputSize, outputSize, + DataType::FLOAT8_E4M3B15, plan_, stream); + break; case DataType::INT32: case DataType::UINT32: executor->execute(rank, (int*)input, (int*)output, inputSize, outputSize, DataType::UINT32, plan_, stream); diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu index 2d36bcf5..28ced77f 100644 --- a/src/core/executor/execution_kernel.cu +++ b/src/core/executor/execution_kernel.cu @@ -82,6 +82,12 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo case DataType::FLOAT8_E5M2: // FP8 is not supported in CUDA execution kernel. 
break; + case DataType::FLOAT8_E4M3B15: + // fp8_e4m3b15 is a software type not supported in the CUDA execution kernel. + break; + case DataType::AUTO: + // AUTO is a sentinel resolved before reaching this point; nothing to do. + break; } } diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index 20147c30..87b88888 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -210,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = cal_vector(tmp, val); + tmp = calVector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -353,9 +353,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = cal_vector(data, val); + data = calVector(data, val); } - data = cal_vector(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -394,9 +394,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = cal_vector(data, val); + data = calVector(data, val); } - data = cal_vector(data, srcPacketPayload[idx]); + data = calVector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -464,7 +464,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo 
size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = cal_vector(tmp, val); + tmp = calVector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -899,6 +899,17 @@ class ExecutionKernel { #endif break; #endif // __FP8_TYPES_EXIST__ + case DataType::FLOAT8_E4M3B15: + executionKernel<__fp8_e4m3b15, PacketType, ReuseScratch><<>>( + rank, (__fp8_e4m3b15*)src, (__fp8_e4m3b15*)dst, (__fp8_e4m3b15*)scratch, scratchOffset, scratchChunkSize, + plan, semaphores, localMemoryIdBegin, flag +#if defined(ENABLE_NPKIT) + , + NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else + ); +#endif + break; case DataType::UINT8: executionKernel<<>>( rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, @@ -910,6 +921,10 @@ class ExecutionKernel { ); #endif break; + case DataType::AUTO: + // AUTO is a sentinel that must be resolved before reaching this point. 
+ assert(false && "DataType::AUTO must be resolved before kernel launch"); + break; } } #else // !defined(MSCCLPP_DEVICE_HIP) diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp index fd9bd1e9..463f827d 100644 --- a/src/core/include/reduce_kernel.hpp +++ b/src/core/include/reduce_kernel.hpp @@ -14,7 +14,7 @@ namespace mscclpp { // Generic element-wise calculation helper template -MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { +MSCCLPP_DEVICE_INLINE T calElements(const T& a, const T& b) { if constexpr (OpType == SUM) { return a + b; } else if constexpr (OpType == MIN) { @@ -24,56 +24,168 @@ MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { } // Generic vector reduction helpers -template -MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} template -MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) { +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); return ret; } -template -MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +/// f32x2 specialization for uint2: uses packed f32x2 operator+ (Blackwell __fadd2_rn when available). 
+template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = fa + fb; + return bit_cast(fr); +} + +template <> +MSCCLPP_DEVICE_INLINE uint2 calVectorHelper(const uint2& a, const uint2& b) { + f32x2 fa = bit_cast(a); + f32x2 fb = bit_cast(b); + f32x2 fr = mscclpp::min(fa, fb); + return bit_cast(fr); } template -MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(calElements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(calElements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(calElements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(calElements(bit_cast(a.z), bit_cast(b.z))); + return ret; } -// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper +/// f32x2 specialization for int4: process as two uint2 pairs using packed f32x2 arithmetic. 
+template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template <> +MSCCLPP_DEVICE_INLINE int4 calVectorHelper(const int4& a, const int4& b) { + uint2 lo_a = {(uint32_t)a.x, (uint32_t)a.y}; + uint2 hi_a = {(uint32_t)a.z, (uint32_t)a.w}; + uint2 lo_b = {(uint32_t)b.x, (uint32_t)b.y}; + uint2 hi_b = {(uint32_t)b.z, (uint32_t)b.w}; + uint2 lo_r = calVectorHelper(lo_a, lo_b); + uint2 hi_r = calVectorHelper(hi_a, hi_b); + return {(int)lo_r.x, (int)lo_r.y, (int)hi_r.x, (int)hi_r.y}; +} + +template +MSCCLPP_DEVICE_INLINE int calVectorHelper(const int& a, const int& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + return bit_cast(calElements(bit_cast(a), bit_cast(b))); +} + +/// f32x2 specialization for uint32_t: a single float packed in 32 bits (scalar fallback). 
+template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fa + fb); +} + +template <> +MSCCLPP_DEVICE_INLINE uint32_t calVectorHelper(const uint32_t& a, const uint32_t& b) { + float fa = bit_cast(a); + float fb = bit_cast(b); + return bit_cast(fminf(fa, fb)); +} + +// calVector wrapper – converts scalar types to vector types and calls calVectorHelper template -MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) { +MSCCLPP_DEVICE_INLINE DataType calVector(const DataType& a, const DataType& b) { // Define the vectorized computation type based on the element type static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); using CompType = typename std::conditional_t< - std::is_same_v, f16x2, + std::is_same_v, f32x2, std::conditional_t< - std::is_same_v, bf16x2, - std::conditional_t, u8x4, + std::is_same_v, f16x2, + std::conditional_t< + std::is_same_v, bf16x2, + std::conditional_t< + std::is_same_v, u8x4, + std::conditional_t, f8_e4m3b15x4, #if defined(__FP8_TYPES_EXIST__) - std::conditional_t, f8_e4m3x4, - std::conditional_t, f8_e5m2x4, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>>; + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, T>> #else - >>>; + T #endif - return cal_vector_helper(a, b); + >>>>>; + return calVectorHelper(a, b); +} + +/// Upcast a packed DataType (containing T elements) to a packed AccDataType (containing AccumT elements). +/// Uses the optimized to<>() specializations when available (e.g. FP8 -> float hardware intrinsics). +/// When AccumT == T, this is a no-op identity. 
+template +MSCCLPP_DEVICE_INLINE AccDataType upcastVector(const DataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + ToVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Downcast a packed AccDataType (containing AccumT elements) back to DataType (containing T elements). +/// Uses the optimized to<>() specializations when available. +/// When AccumT == T, this is a no-op identity. +template +MSCCLPP_DEVICE_INLINE DataType downcastVector(const AccDataType& val) { + if constexpr (std::is_same_v) { + return val; + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + FromVec result = mscclpp::to(reinterpret_cast(val)); + return reinterpret_cast(result); + } +} + +/// Accumulate `val` (packed T elements in DataType) into `acc` (packed AccumT elements in AccDataType). +/// When AccumT == T, falls back to the standard calVector. +/// Otherwise, upcasts val to AccumT, reduces element-wise, and returns the AccumT accumulator. 
+template +MSCCLPP_DEVICE_INLINE AccDataType calVectorAccum(const AccDataType& acc, const DataType& val) { + if constexpr (std::is_same_v) { + return calVector(acc, val); + } else { + constexpr int nElems = sizeof(DataType) / sizeof(T); + using FromVec = VectorType; + using ToVec = VectorType; + + ToVec fv = mscclpp::to(reinterpret_cast(val)); + const ToVec& fa = reinterpret_cast(acc); + ToVec fr; +#pragma unroll + for (int i = 0; i < nElems; ++i) { + fr.data[i] = calElements(fa.data[i], fv.data[i]); + } + return reinterpret_cast(fr); + } } #endif // defined(MSCCLPP_DEVICE_COMPILE) diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 0b288b38..fb51a342 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -183,7 +183,8 @@ std::shared_ptr AllgatherFullmesh::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] DataType dtype, [[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> CommResult { + const std::unordered_map& extras, + [[maybe_unused]] DataType accumDtype) -> CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index cf6027d9..9d169d68 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -212,7 +212,8 @@ std::shared_ptr AllgatherFullmesh2::build() { [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, [[maybe_unused]] mscclpp::DataType dtype, 
[[maybe_unused]] ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) -> mscclpp::CommResult { + const std::unordered_map& extras, + [[maybe_unused]] mscclpp::DataType accumDtype) -> mscclpp::CommResult { return self->allgatherKernelFunc(ctx, input, output, inputSize, stream, nBlocks, nThreadsPerBlock, extras); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index 83950d7c..6cbc8977 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -47,7 +47,7 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand const int remoteRank = index < rank ? index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vector(val, data); + data = calVector(val, data); } dst[idx] = data; } @@ -67,7 +67,7 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int return {(worldSize - 1) * 4, 512}; } -template +template struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, @@ -94,7 +94,8 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); std::pair blockAndThreadNum{nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ 
-105,7 +106,7 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -161,9 +162,9 @@ std::shared_ptr AllreduceAllpairPacket::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index 13c63ba1..ee46fd77 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -9,7 +9,7 @@ namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(512, 1) allreduceFullmesh(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, int rank, @@ -26,6 +26,10 @@ __global__ void __launch_bounds__(512, 1) int4* scratch4 = reinterpret_cast((char*)scratch); int4* resultBuff4 = reinterpret_cast(resultBuff); + // AccumVec: wider vector for mixed-precision accumulation. When AccumT==T, this is just int4 (no-op). 
+ constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` constexpr size_t unitNInt4 = 512; const size_t maxNInt4PerBlock = @@ -81,12 +85,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vector(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -121,12 +127,14 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + int4 rawData = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + AccumVec acc = mscclpp::upcastVector(rawData); for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vector(val, data); + acc = mscclpp::calVectorAccum(acc, val); } + int4 data = mscclpp::downcastVector(acc); resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), @@ -144,7 +152,7 @@ __global__ void __launch_bounds__(512, 1) } } -template +template struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, @@ -155,7 +163,7 @@ struct AllreduceAllconnectAdapter { size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; if (nThreadsPerBlock == 0) nThreadsPerBlock = 512; - allreduceFullmesh<<>>( + allreduceFullmesh<<>>( (T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, (ChannelType*)memoryOutChannels, channelOutDataOffset, rank, nRanksPerNode, worldSize, nelems); return cudaGetLastError(); @@ -174,10 +182,10 @@ void AllreduceFullmesh::initialize(std::shared_ptr comm) { localScratchMemory_ = std::move(localMemory); } -CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, - int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceFullmesh::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; @@ -198,7 +206,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const 
std::shared_ptr ct } inputChannelHandles = this->memoryChannelsMap_[input].second; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", static_cast(op), static_cast(dtype)); @@ -261,9 +269,10 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu index b542a6a6..2d71cd63 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu @@ -146,7 +146,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsBlockPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, @@ -155,6 +155,9 @@ struct NvlsBlockPipelineAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
+ return cudaErrorNotSupported; } else #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { @@ -187,9 +190,10 @@ void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr comm) CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map& extras, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -235,9 +239,9 @@ std::shared_ptr AllreduceNvlsBlockPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index 9824fbcd..a616485e 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -1,15 +1,17 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+#include + #include "allreduce/allreduce_nvls_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, @@ -31,15 +33,16 @@ __global__ void __launch_bounds__(1024, 1) mscclpp::SwitchChannelDeviceHandle::multimemStore(*(mscclpp::f32x2*)(&pkt), multiPkt + i); } for (uint32_t i = tid; i < nPktPerRank * worldSize; i += blockDim.x * gridDim.x) { - uint data = src[i]; + // When T == AccumT, stay with raw uint to avoid type mismatch in identity path. + using AccRaw = + std::conditional_t, uint, mscclpp::VectorType>; + AccRaw acc = mscclpp::upcastVector(src[i]); for (int peer = 0; peer < worldSize; peer++) { - if (peer == rank) { - continue; - } + if (peer == rank) continue; uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vector(data, val); + acc = mscclpp::calVectorAccum(acc, val); } - dst[i] = data; + dst[i] = mscclpp::downcastVector(acc); } __syncthreads(); if (threadIdx.x == 0) { @@ -62,13 +65,13 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize) { return {blockNum, threadNum}; } -template +template struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { - allreduceNvlsPacket<<>>( + allreduceNvlsPacket<<>>( (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags, flagBufferSize); return cudaGetLastError(); @@ -78,6 +81,8 @@ struct 
AllreduceNvlsPacketAdapter { void AllreduceNvlsPacket::initialize(std::shared_ptr comm) { int nSwitchChannels = 1; this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels); + this->switchChannels_ = + setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); } AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { @@ -92,9 +97,7 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode(); // setup channels - int nSwitchChannels = 1; - ctx->switchChannels = - setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels); + ctx->switchChannels = this->switchChannels_; ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels); return ctx; } @@ -102,19 +105,20 @@ std::shared_ptr AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr< CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + mscclpp::DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { blockAndThreadNum = getDefaultBlockNumAndThreadNum(inputSize); } if (blockAndThreadNum.first > maxBlockNum_) { - WARN("Block number %d exceeds the maximum limit %d", blockAndThreadNum.first, maxBlockNum_); + WARN(ALGO, "Block number ", blockAndThreadNum.first, " exceeds the maximum limit ", maxBlockNum_); return CommResult::CommInvalidArgument; } - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) 
{ - WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } cudaError_t error = @@ -122,7 +126,7 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreduceNvlsPacket failed with error: ", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -136,9 +140,10 @@ std::shared_ptr AllreduceNvlsPacket::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu index bc03ab26..3bb054da 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu @@ -109,7 +109,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsWarpPipelineAdapter { static cudaError_t 
call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, @@ -118,6 +118,9 @@ struct NvlsWarpPipelineAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. + return cudaErrorNotSupported; } else #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { @@ -147,12 +150,12 @@ void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr comm) { this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_); } -CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, - void* output, size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc( + const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, DataType dtype, + ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -198,9 +201,9 @@ std::shared_ptr AllreduceNvlsWarpPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, 
int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu index f251bcda..e7f2028f 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu @@ -67,7 +67,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct NvlsAdapter { static cudaError_t call(const void*, void*, void*, void* memoryChannels, void*, mscclpp::DeviceHandle* nvlsChannels, @@ -77,6 +77,9 @@ struct NvlsAdapter { // uint8_t is not supported for NVLS (no hardware support for byte-level reduction) if constexpr (std::is_same_v) { return cudaErrorNotSupported; + } else if constexpr (std::is_same_v) { + // fp8_e4m3b15 is a software-only type with no hardware NVLS support. 
+ return cudaErrorNotSupported; } else #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) if constexpr (std::is_same_v || std::is_same_v) { @@ -114,13 +117,14 @@ void AllreduceNvls::initialize(std::shared_ptr comm) { CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + [[maybe_unused]] const std::unordered_map& extras, + mscclpp::DataType accumDtype) { if (!symmetricMemory_) { WARN("AllreduceNvls requires symmetric memory for now."); return CommResult::CommInvalidArgument; } auto ctx = std::static_pointer_cast(ctx_void); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; @@ -203,9 +207,10 @@ std::shared_ptr AllreduceNvls::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index ceb545ee..e2d8ef73 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ 
b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -2,16 +2,17 @@ // Licensed under the MIT License. #include +#include #include "allreduce/allreduce_packet.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" -#include "debug.h" +#include "logger.hpp" namespace mscclpp { namespace collective { -template +template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, @@ -92,12 +93,21 @@ __global__ void __launch_bounds__(1024, 1) // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = src[idx]; - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data.x = cal_vector(val.x, data.x); - data.y = cal_vector(val.y, data.y); + { + // When T == AccumT, stay with raw uint32_t to avoid type mismatch in identity path. + using AccRaw = std::conditional_t, uint32_t, + mscclpp::VectorType>; + AccRaw accX = mscclpp::upcastVector(data.x); + AccRaw accY = mscclpp::upcastVector(data.y); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + accX = mscclpp::calVectorAccum(accX, val.x); + accY = mscclpp::calVectorAccum(accY, val.y); + } + data.x = mscclpp::downcastVector(accX); + data.y = mscclpp::downcastVector(accY); } dst[idx].x = data.x; @@ -142,7 +152,7 @@ __global__ void __launch_bounds__(1024, 1) #endif } -template +template struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, @@ -155,12 +165,12 @@ struct PacketAdapter { nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreducePacket<<>>( + allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif @@ -186,18 +196,22 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int } } -#if defined(__FP8_TYPES_EXIST__) // FP8-specific tuning for 32KB-256KB range - if (dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2) { - if (inputSize < (64 << 10)) { - nThreadsPerBlock = 64; - } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { - nThreadsPerBlock = 128; - } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { - nThreadsPerBlock = 256; + { + bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; +#if defined(__FP8_TYPES_EXIST__) + isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || 
dtype == DataType::FLOAT8_E5M2; +#endif + if (isFp8) { + if (inputSize < (64 << 10)) { + nThreadsPerBlock = 64; + } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) { + nThreadsPerBlock = 128; + } else if (inputSize >= (128 << 10) && inputSize <= (256 << 10)) { + nThreadsPerBlock = 256; + } } } -#endif #endif return {nBlocks, nThreadsPerBlock}; } @@ -213,7 +227,8 @@ void AllreducePacket::initialize(std::shared_ptr comm) { CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, size_t inputSize, [[maybe_unused]] DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto ctx = std::static_pointer_cast(ctx_void); std::pair blockAndThreadNum = {nBlocks, nThreadsPerBlock}; if (blockAndThreadNum.first == 0 || blockAndThreadNum.second == 0) { @@ -225,9 +240,10 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { - WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); + WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), + ", dtype=", static_cast(dtype)); return CommResult::CommInvalidArgument; } cudaError_t error = @@ -236,7 +252,7 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { - WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); + WARN(ALGO, "AllreducePacket failed with error: ", 
cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; } return CommResult::CommSuccess; @@ -280,9 +296,9 @@ std::shared_ptr AllreducePacket::build() { "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) { + int nThreadsPerBlock, const std::unordered_map& extras, DataType accumDtype) { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu index d5be2257..db471b93 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag.cu @@ -87,7 +87,7 @@ __global__ void __launch_bounds__(1024, 1) int rankIdx = (rank + i + 1) % nRanksPerNode; int peerIdx = rankIdx < rank ? 
rankIdx : rankIdx - 1; int4 data = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); - tmp = cal_vector(data, tmp); + tmp = calVector(data, tmp); } for (uint32_t i = 0; i < nPeers; i++) { int rankIdx = (rank + i + 1) % nRanksPerNode; @@ -123,7 +123,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -166,9 +166,9 @@ void AllreduceRsAg::initialize(std::shared_ptr comm) { CommResult AllreduceRsAg::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -213,9 +213,10 @@ std::shared_ptr AllreduceRsAg::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git 
a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu index a230d8cd..eabe3dc5 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_pipeline.cu @@ -168,7 +168,7 @@ __global__ void __launch_bounds__(1024, 1) uint32_t peerSlotOffset = baseOffset + remoteRankId * nInt4PerIter + threadIdInPut + putStep * blockDim.x * nblocksForPut; int4 data = scratch4[peerSlotOffset]; - tmp = cal_vector(data, tmp); + tmp = calVector(data, tmp); } storeVec(resultBuff, myChunkOffset, tmp, nelems); // Broadcast reduced result to all peers' scratch at SCATTER_AG_OFFSET + rank * nInt4PerIter @@ -220,7 +220,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgPipelineAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -274,12 +274,12 @@ void AllreduceRsAgPipeline::initialize(std::shared_ptr comm) { cudaMemcpyHostToDevice); } -CommResult AllreduceRsAgPipeline::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, - size_t inputSize, DataType dtype, ReduceOp op, - cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { +CommResult AllreduceRsAgPipeline::allreduceKernelFunc( + const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, + cudaStream_t stream, int nBlocks, int nThreadsPerBlock, + [[maybe_unused]] const std::unordered_map& extras, DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -320,9 +320,10 @@ std::shared_ptr 
AllreduceRsAgPipeline::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu index caac07ae..f95ba7e3 100644 --- a/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_rsag_zero_copy.cu @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#include + #include "allreduce/allreduce_rsag_zero_copy.hpp" #include "allreduce/common.hpp" #include "collective_utils.hpp" @@ -36,7 +38,7 @@ __device__ mscclpp::DeviceSyncer globalSyncer; // the extra copy steps of the standard RSAG. The NRanksPerNode template // parameter enables compile-time unrolling of peer loops (supports 4 or 8). -template +template __global__ void __launch_bounds__(1024, 1) allreduceRsAgZeroCopy(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, DeviceHandle* switchChannels, void* remoteMemories, int rank, int worldSize, @@ -73,19 +75,26 @@ __global__ void __launch_bounds__(1024, 1) } __syncthreads(); int4 data[NPeers]; + // AccumInt4: when AccumT != T, use a wider accumulator type. + // For AccumT == T, this is just int4 (no-op conversion). 
+ constexpr int nElemsPerInt4 = sizeof(int4) / sizeof(T); + // When T == AccumT, stay with raw int4 to avoid type mismatch in identity path. + using AccumVec = std::conditional_t, int4, mscclpp::VectorType>; for (uint32_t idx = threadIdx.x; idx < nInt4PerBlock; idx += blockDim.x) { uint32_t offset = idx + offset4 + rank * nInt4PerRank; if (offset >= nInt4Total) continue; - int4 tmp = buff4[offset]; + int4 tmp_raw = buff4[offset]; #pragma unroll for (int i = 0; i < NPeers; i++) { int rankIdx = (rank + i + 1) % NRanksPerNode; int peerIdx = rankIdx < rank ? rankIdx : rankIdx - 1; data[i] = mscclpp::read(((void**)remoteMemories)[peerIdx], offset); } + AccumVec acc = mscclpp::upcastVector(tmp_raw); for (int i = 0; i < NPeers; i++) { - tmp = cal_vector(data[i], tmp); + acc = mscclpp::calVectorAccum(acc, data[i]); } + int4 tmp = mscclpp::downcastVector(acc); #pragma unroll for (int i = 0; i < NPeers; i++) { int rankIdx = (rank + i + 1) % NRanksPerNode; @@ -102,7 +111,7 @@ __global__ void __launch_bounds__(1024, 1) } } -template +template struct AllreduceRsAgZeroCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* remoteMemories, DeviceHandle* switchChannel, DeviceHandle*, size_t, size_t, @@ -118,11 +127,11 @@ struct AllreduceRsAgZeroCopyAdapter { } } if (nRanksPerNode == 4) { - allreduceRsAgZeroCopy<4, OpType, T> + allreduceRsAgZeroCopy<4, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else if (nRanksPerNode == 8) { - allreduceRsAgZeroCopy<8, OpType, T> + allreduceRsAgZeroCopy<8, OpType, T, AccumT> <<>>((T*)input, (T*)scratch, (T*)output, (ChannelType*)memoryChannels, switchChannel, remoteMemories, rank, worldSize, nelems); } else { @@ -145,9 +154,10 @@ void AllreduceRsAgZeroCopy::initialize(std::shared_ptr comm) { CommResult AllreduceRsAgZeroCopy::allreduceKernelFunc(const std::shared_ptr ctx, const void* 
input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map&) { + const std::unordered_map&, + DataType accumDtype) { auto algoCtx = std::static_pointer_cast(ctx); - AllreduceFunc allreduce = dispatch(op, dtype); + AllreduceFunc allreduce = dispatch(op, dtype, accumDtype); if (!allreduce) { WARN(ALGO, "Unsupported operation or data type for allreduce: op=", static_cast(op), ", dtype=", static_cast(dtype)); @@ -220,9 +230,10 @@ std::shared_ptr AllreduceRsAgZeroCopy::build() { [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras) -> CommResult { + int nThreadsPerBlock, const std::unordered_map& extras, + DataType accumDtype) -> CommResult { return self->allreduceKernelFunc(ctx, input, output, inputSize, dtype, op, stream, nBlocks, nThreadsPerBlock, - extras); + extras, accumDtype); }, [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, diff --git a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index bd402cfa..362308b2 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -20,7 +20,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr 
initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index fa811b15..a54352b3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -16,7 +16,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp index 8b9b04ae..81b74add 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 65a48923..fb0c63b8 100644 --- 
a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -21,7 +21,8 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, - int nThreadsPerBlock, const std::unordered_map& extras); + int nThreadsPerBlock, const std::unordered_map& extras, + mscclpp::DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); @@ -34,6 +35,7 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { uintptr_t flagBuffer_; size_t flagBufferSize_; std::vector> nvlsConnections_; + std::vector switchChannels_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp index e392b54e..8f02a873 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp index d0593500..d53ea180 100644 --- 
a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp @@ -19,7 +19,7 @@ class AllreduceNvls : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f0438dea..de7ca471 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -20,7 +20,7 @@ class AllreducePacket : public AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp index 6e033f67..1fd663da 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag.hpp @@ -19,7 +19,7 @@ class AllreduceRsAg : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, 
int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp index 2a740ac0..7629f2fe 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_pipeline.hpp @@ -19,7 +19,7 @@ class AllreduceRsAgPipeline : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp index 6153a0e4..05bf2ef3 100644 --- a/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_rsag_zero_copy.hpp @@ -18,7 +18,7 @@ class AllreduceRsAgZeroCopy : public mscclpp::AlgorithmBuilder { void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras); + const std::unordered_map& extras, DataType accumDtype); std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); diff --git a/src/ext/collectives/include/allreduce/common.hpp 
b/src/ext/collectives/include/allreduce/common.hpp index 9bfac69a..1e0e7e69 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -#ifndef MSCCLPP_ALLREDUCE_COMMOM_HPP_ -#define MSCCLPP_ALLREDUCE_COMMOM_HPP_ +#ifndef MSCCLPP_ALLREDUCE_COMMON_HPP_ +#define MSCCLPP_ALLREDUCE_COMMON_HPP_ #include #include @@ -77,55 +77,51 @@ using AllreduceFunc = mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; -template