mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
updates
This commit is contained in:
@@ -43,7 +43,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/codecov.yaml
|
||||
- template: templates/codecov.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/codecov.yaml
|
||||
- template: templates/codecov.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/codecov.yaml
|
||||
- template: templates/codecov.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
|
||||
@@ -41,7 +41,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yaml
|
||||
- template: templates/integration-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
@@ -60,7 +60,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yaml
|
||||
- template: templates/integration-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
|
||||
@@ -50,13 +50,13 @@ jobs:
|
||||
echo "Entry already exists, nothing to do."
|
||||
fi
|
||||
|
||||
- template: templates/deploy.yaml
|
||||
- template: templates/deploy.yml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
resourceGroup: msccl-IT
|
||||
|
||||
- template: templates/run-remote-task.yaml
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMscclppTest
|
||||
displayName: Run multi-nodes mscclpp-test
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
|
||||
|
||||
- template: templates/run-remote-task.yaml
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodeUnitTest
|
||||
displayName: Run multi-nodes unit tests
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
|
||||
|
||||
- template: templates/run-remote-task.yaml
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonTests
|
||||
displayName: Run multi-nodes python tests
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh pytests
|
||||
|
||||
- template: templates/run-remote-task.yaml
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMultiNodePythonBenchmark
|
||||
displayName: Run multi-nodes python benchmark
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
|
||||
|
||||
- template: templates/stop.yaml
|
||||
- template: templates/stop.yml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/nccl-test.yaml
|
||||
- template: templates/nccl-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
@@ -60,7 +60,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/nccl-test.yaml
|
||||
- template: templates/nccl-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/rccl-test.yaml
|
||||
- template: templates/rccl-test.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
|
||||
@@ -10,7 +10,7 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -22,7 +22,7 @@ steps:
|
||||
buildName: BuildCoverage
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: TestsCoverageNonPerf
|
||||
displayName: Run unit_tests + mp_unit_tests (non-perf) with coverage
|
||||
@@ -44,7 +44,7 @@ steps:
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests --exclude-perf-tests
|
||||
echo "mp_unit_tests -np 4: PASSED"
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: CaptureCoverage
|
||||
displayName: Capture coverage data with lcov
|
||||
@@ -104,7 +104,7 @@ steps:
|
||||
./codecov upload-process --disable-search -t $(CODECOV_TOKEN) -f coverage.info --flag ${{ parameters.platform }}-${{ parameters.gpuArch }}
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -10,14 +10,14 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: AllGatherTest
|
||||
displayName: Run mscclpp AllGather test
|
||||
@@ -27,14 +27,14 @@ steps:
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: SendRecvTest
|
||||
displayName: Run mscclpp SendRecv test
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: AllReduceTest
|
||||
displayName: Run mscclpp AllReduce test
|
||||
@@ -47,7 +47,7 @@ steps:
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: AllToAll
|
||||
displayName: Run mscclpp AllToAll test
|
||||
@@ -55,14 +55,14 @@ steps:
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: CheckPerfNumber
|
||||
displayName: Check collective primitives performance
|
||||
remoteScript: |
|
||||
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PythonAllReduceBenchmark
|
||||
displayName: Python Allreduce Benchmark
|
||||
@@ -70,7 +70,7 @@ steps:
|
||||
python3 -m pip install .
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,4 +1,4 @@
|
||||
# .azure-pipelines/templates/nccl-test.yaml
|
||||
# .azure-pipelines/templates/nccl-test.yml
|
||||
# ----------------------------------------
|
||||
# A step‐template that runs the entire MSCCLPP→NCCL test suite on one pool/container.
|
||||
#
|
||||
@@ -15,13 +15,13 @@ parameters:
|
||||
default: "-gencode=arch=compute_80,code=sm_80"
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
deployArgs: 'nccltest-single-node'
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallNcclTests
|
||||
displayName: Install NCCL Tests
|
||||
@@ -31,7 +31,7 @@ steps:
|
||||
cd nccl-tests
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallNccl
|
||||
displayName: Install NCCL
|
||||
@@ -46,7 +46,7 @@ steps:
|
||||
cd nccl
|
||||
make -j src.build NVCC_GENCODE=${{ parameters.nvccGencode }}
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclAllGatherFallbaclkToNcclTest
|
||||
displayName: Run NCCL AllGather Test with or without Fallback to NCCL operation
|
||||
@@ -54,7 +54,7 @@ steps:
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclAllReduceFallbaclkToNcclTest
|
||||
displayName: Run NCCL AllReduce Test with or without Fallback to NCCL operation
|
||||
@@ -62,7 +62,7 @@ steps:
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allgather" /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunNcclBroadcastFallbaclkToNcclTest
|
||||
displayName: Run NCCL Broadcast Test with or without Fallback to NCCL operation
|
||||
@@ -70,7 +70,7 @@ steps:
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=/root/nccl/build/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="allreduce" /root/nccl-tests/build/broadcast_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -1,4 +1,4 @@
|
||||
# .azure-pipelines/templates/rccl-test.yaml
|
||||
# .azure-pipelines/templates/rccl-test.yml
|
||||
# ------------------------------------------------
|
||||
# A step-template that runs the entire MSCCLPP→RCCL test suite on one pool/container.
|
||||
#
|
||||
@@ -17,7 +17,7 @@ parameters:
|
||||
default: "gfx942"
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -27,7 +27,7 @@ steps:
|
||||
deployArgs: 'single-node-test true rocm'
|
||||
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: InstallRcclTests
|
||||
displayName: Install RCCL Tests
|
||||
@@ -41,7 +41,7 @@ steps:
|
||||
cd projects/rccl-tests
|
||||
MPI=1 MPI_HOME=/usr/local/mpi make -j
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunRcclAllGatherTest
|
||||
displayName: Run RCCL AllGather Test with or without MSCCLPP Lib
|
||||
@@ -49,7 +49,7 @@ steps:
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_gather_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: RunRcclAllReduceTest
|
||||
displayName: Run RCCL AllReduce Test with or without MSCCLPP Lib
|
||||
@@ -57,7 +57,7 @@ steps:
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/lib/libmscclpp_nccl.so -x MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 -x NCCL_DEBUG=WARN /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
mpirun -np 8 --bind-to numa --allow-run-as-root /root/rocm-systems/projects/rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -7,7 +7,7 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -15,14 +15,14 @@ steps:
|
||||
cmakeArgs: '-DMSCCLPP_USE_IB=OFF'
|
||||
deployArgs: 'single-node-test false'
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
remoteScript: |
|
||||
./build/bin/unit_tests
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
@@ -31,14 +31,14 @@ steps:
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: StopContainer
|
||||
displayName: Stop existing container
|
||||
@@ -82,14 +82,14 @@ steps:
|
||||
arguments: single-node-test false
|
||||
workingDirectory: $(System.DefaultWorkingDirectory)
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTestsWithIbBuildDisableIb
|
||||
displayName: Run pytests (IB build, IB tests disabled)
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -8,7 +8,7 @@ parameters:
|
||||
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -16,7 +16,7 @@ steps:
|
||||
cmakeArgs: '-DMSCCLPP_NPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT"'
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
@@ -30,7 +30,7 @@ steps:
|
||||
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
@@ -51,7 +51,7 @@ steps:
|
||||
grep -q NPKIT_EVENT_EXECUTOR_PUT_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
grep -q NPKIT_EVENT_EXECUTOR_REDUCE_SEND_PACKET_ENTRY ./npkit_output/npkit_event_trace.json
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -10,7 +10,7 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
- template: deploy.yaml
|
||||
- template: deploy.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -19,14 +19,14 @@ steps:
|
||||
deployArgs: 'single-node-test true ${{ parameters.platform }}'
|
||||
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: UnitTests
|
||||
displayName: Run mscclpp unit tests
|
||||
remoteScript: |
|
||||
./build/bin/unit_tests
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: MpUnitTests
|
||||
displayName: Run mscclpp multi-process unit tests
|
||||
@@ -35,14 +35,14 @@ steps:
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests
|
||||
|
||||
- template: run-remote-task.yaml
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: PyTests
|
||||
displayName: Run pytests
|
||||
remoteScript: |
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
|
||||
|
||||
- template: stop.yaml
|
||||
- template: stop.yml
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-npkit.yaml
|
||||
- template: templates/ut-npkit.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
@@ -102,7 +102,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-npkit.yaml
|
||||
- template: templates/ut-npkit.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
@@ -123,7 +123,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut-no-ib-env.yaml
|
||||
- template: templates/ut-no-ib-env.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
@@ -142,7 +142,7 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/ut.yaml
|
||||
- template: templates/ut.yml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-mi300x
|
||||
vmssName: mscclpp-mi300x-ci
|
||||
|
||||
12
README.md
12
README.md
@@ -3,16 +3,16 @@
|
||||
[](https://github.com/microsoft/mscclpp/releases/latest)
|
||||
[](LICENSE)
|
||||
[](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)
|
||||
[](https://microsoft.github.io/mscclpp/)
|
||||
[](https://microsoft.github.io/mscclpp/)
|
||||
[](https://codecov.io/gh/microsoft/mscclpp)
|
||||
|
||||
| Testing Pipelines | Build Status |
|
||||
|--------------------------|-------------------|
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
| Unit Tests (ROCm) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) |
|
||||
| NCCL Tests | [](https://dev.azure.com/msazure/One/_build/latest?definitionId=320665&branchName=main) |
|
||||
| RCCL Tests | [](https://dev.azure.com/msazure/One/_build/latest?definitionId=448013&branchName=main) |
|
||||
| Unit Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Unit Tests (ROCm) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) |
|
||||
| Integration Tests (CUDA) | [](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) |
|
||||
| NCCL Tests | [)](https://msazure.visualstudio.com/One/_build/latest?definitionId=320665&repoName=microsoft%2Fmscclpp&branchName=main) |
|
||||
| RCCL Tests | [)](https://msazure.visualstudio.com/One/_build/latest?definitionId=448013&branchName=main) |
|
||||
|
||||
A GPU-driven communication stack for scalable AI applications.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user