mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Now build images by a script with a shared Dockerfile template --------- Co-authored-by: Binyang Li <binyli@microsoft.com> Co-authored-by: Saeed Maleki <saemal@microsoft.com>
70 lines
3.7 KiB
YAML
70 lines
3.7 KiB
YAML
name: IntegrationTest
|
|
|
|
on: workflow_dispatch
|
|
|
|
jobs:
|
|
IntegrationTest:
|
|
runs-on: [ self-hosted, A100 ]
|
|
defaults:
|
|
run:
|
|
shell: bash
|
|
strategy:
|
|
matrix:
|
|
cuda: [ cuda11.8, cuda12.2 ]
|
|
|
|
container:
|
|
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
|
|
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
|
|
|
steps:
|
|
- name: Checkout
|
|
uses: actions/checkout@v4
|
|
|
|
- name: Build
|
|
run: |
|
|
mkdir build && cd build
|
|
cmake -DCMAKE_BUILD_TYPE=Release ..
|
|
make -j
|
|
|
|
- name: Lock GPU clock frequency
|
|
run: |
|
|
sudo nvidia-smi -pm 1
|
|
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
|
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
|
done
|
|
|
|
- name: Run mscclpp AllGather test
|
|
run: |
|
|
set -e
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
|
|
|
- name: Run mscclpp SendRecv test
|
|
run: |
|
|
set -e
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
|
|
|
- name: Run mscclpp AllReduce test
|
|
run: |
|
|
set -e
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
|
|
|
- name: Run mscclpp AllToAll test
|
|
run: |
|
|
set -e
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
|
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
|
|
|
- name: Check collective primitives performance
|
|
run: |
|
|
set -e
|
|
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
|