mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 17:55:52 +00:00
- Add new algos (allreduce_rsag, allreduce_rsag_pipeline and
allreduce_rsag_zero_copy) for GB200.
- Add IB stub for non-IB env
- Provide an example of algorithm tuning with different nblocks/nthreads
Perf for allreduce_rsag
```
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1048576 262144 float sum -1 25.16 41.67 62.51 0 23.73 44.18 66.27 0
2097152 524288 float sum -1 26.06 80.47 120.71 0 25.31 82.86 124.29 0
4194304 1048576 float sum -1 31.09 134.93 202.39 0 30.75 136.39 204.58 0
8388608 2097152 float sum -1 45.52 184.29 276.43 0 45.13 185.87 278.80 0
16777216 4194304 float sum -1 75.73 221.53 332.30 0 75.51 222.18 333.27 0
33554432 8388608 float sum -1 137.25 244.48 366.72 0 137.22 244.54 366.81 0
67108864 16777216 float sum -1 271.34 247.32 370.99 0 270.86 247.76 371.65 0
134217728 33554432 float sum -1 534.25 251.22 376.84 0 534.43 251.14 376.71 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 264.454
#
# Collective test concluded: all_reduce_perf
```
Perf for allreduce_rsag_pipeline
```
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1048576 262144 float sum -1 61.57 17.03 25.55 0 61.51 17.05 25.57 0
2097152 524288 float sum -1 61.31 34.20 51.31 0 61.23 34.25 51.38 0
4194304 1048576 float sum -1 61.62 68.06 102.10 0 61.84 67.83 101.74 0
8388608 2097152 float sum -1 61.97 135.37 203.06 0 61.89 135.53 203.30 0
16777216 4194304 float sum -1 63.15 265.65 398.48 0 62.89 266.76 400.15 0
33554432 8388608 float sum -1 100.63 333.46 500.19 0 99.76 336.34 504.51 0
67108864 16777216 float sum -1 180.04 372.75 559.13 0 179.75 373.34 560.01 0
134217728 33554432 float sum -1 339.60 395.23 592.84 0 338.16 396.91 595.36 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 304.665
#
# Collective test concluded: all_reduce_perf
```
Perf for allreduce_rsag_zero_copy
```
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1048576 262144 float sum -1 14.99 69.93 104.90 0 14.44 72.61 108.92 0
2097152 524288 float sum -1 16.19 129.56 194.33 0 15.85 132.32 198.48 0
4194304 1048576 float sum -1 21.19 197.98 296.97 0 20.64 203.20 304.81 0
8388608 2097152 float sum -1 31.04 270.27 405.41 0 30.68 273.44 410.16 0
16777216 4194304 float sum -1 50.34 333.26 499.89 0 50.15 334.51 501.77 0
33554432 8388608 float sum -1 89.58 374.56 561.84 0 88.65 378.48 567.73 0
67108864 16777216 float sum -1 165.69 405.03 607.54 0 163.64 410.10 615.16 0
134217728 33554432 float sum -1 323.19 415.28 622.93 0 318.01 422.05 633.07 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 414.619
#
# Collective test concluded: all_reduce_perf
```
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com>
Co-authored-by: Qinghua Zhou <qinghuazhou@microsoft.com>
Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
117 lines
2.6 KiB
YAML
117 lines
2.6 KiB
YAML
# CodeQL static-analysis workflow: display name and trigger configuration.
name: "CodeQL"

# Triggers. Pushes and pull requests only start the workflow when they target
# one of the listed branches AND touch at least one of the listed paths.
on:
  push:
    branches:
      - main
      - release/*
    paths:
      - 'cmake/**'
      - 'src/**'
      - 'include/**'
      - 'CMakeLists.txt'
      - '.github/workflows/codeql-analysis.yml'
  pull_request:
    branches:
      - main
      - release/*
    paths:
      - 'cmake/**'
      - 'src/**'
      - 'include/**'
      - 'CMakeLists.txt'
      - '.github/workflows/codeql-analysis.yml'
  schedule:
    # Cron fields: minute 30, hour 1, any day-of-month, any month,
    # day-of-week 1 (Monday) -- i.e. a weekly run.
    - cron: "30 1 * * 1"

# Two analysis jobs with the same step sequence; they differ only in the
# container image versions they iterate over and in the CMake configure line.
jobs:
  # CodeQL analysis inside the CUDA development containers.
  analyze-cuda:
    name: Analyze (CUDA)
    runs-on: 'ubuntu-latest'
    container:
      # The image tag suffix comes from the matrix `version` value below.
      image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }}

    # Token scopes granted to this job.
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      # Let the remaining matrix cells keep running when one of them fails.
      fail-fast: false
      matrix:
        language: [ 'cpp', 'python' ]
        version: [ 'cuda11.8', 'cuda12.8' ]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check disk space
        run: |
          df -h

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v4
        with:
          languages: ${{ matrix.language }}

      # Registers the in-container workspace path as a git safe.directory;
      # per the step name this works around git's "dubious ownership" check.
      - name: Dubious ownership exception
        run: |
          git config --global --add safe.directory /__w/mscclpp/mscclpp

      # NOTE(review): this build runs for every matrix cell, including the
      # 'python' language entry -- confirm that is intended.
      # MSCCLPP_BYPASS_GPU_CHECK=ON presumably lets CMake configure on a
      # runner without a GPU -- TODO confirm against CMakeLists.txt.
      - name: Build
        run: |
          rm -rf build && mkdir build && cd build
          cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
          make -j4

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v4
        with:
          # One category per (language, version) matrix cell.
          category: "/language:${{matrix.language}}/version:${{matrix.version}}"

  # CodeQL analysis inside the ROCm development container.
  analyze-rocm:
    name: Analyze (ROCm)
    runs-on: 'ubuntu-latest'
    container:
      # The image tag suffix comes from the matrix `version` value below.
      image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }}

    # Token scopes granted to this job.
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      # Let the remaining matrix cells keep running when one of them fails.
      fail-fast: false
      matrix:
        language: [ 'cpp', 'python' ]
        version: [ 'rocm6.2' ]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check disk space
        run: |
          df -h

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v4
        with:
          languages: ${{ matrix.language }}

      # Registers the in-container workspace path as a git safe.directory;
      # per the step name this works around git's "dubious ownership" check.
      - name: Dubious ownership exception
        run: |
          git config --global --add safe.directory /__w/mscclpp/mscclpp

      # NOTE(review): this build runs for every matrix cell, including the
      # 'python' language entry -- confirm that is intended.
      # The configure line sets CXX to hipcc for this invocation only.
      - name: Build
        run: |
          rm -rf build && mkdir build && cd build
          CXX=/opt/rocm/bin/hipcc cmake -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON ..
          make -j4

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v4
        with:
          # One category per (language, version) matrix cell.
          category: "/language:${{matrix.language}}/version:${{matrix.version}}"