mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
initial pipeline test
This commit is contained in:
@@ -26,14 +26,12 @@ pr:
|
||||
- '**/*.md'
|
||||
|
||||
jobs:
|
||||
- job: IntegrationTestA100
|
||||
displayName: Integration test A100
|
||||
- job: sglangtest
|
||||
displayName: SGLANG Test
|
||||
strategy:
|
||||
matrix:
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
sglang:
|
||||
containerImage: lmsysorg/sglang:latest
|
||||
|
||||
pool:
|
||||
name: msccl-ci
|
||||
@@ -41,30 +39,9 @@ jobs:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yaml
|
||||
- template: templates/sglang-test.yaml
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
- job: IntegrationTestH100
|
||||
displayName: Integration test H100
|
||||
strategy:
|
||||
matrix:
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
|
||||
pool:
|
||||
name: msccl-ci-h100
|
||||
container:
|
||||
image: $(containerImage)
|
||||
|
||||
steps:
|
||||
- template: templates/integration-test.yaml
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
|
||||
gpuArch: '90'
|
||||
|
||||
@@ -13,16 +13,30 @@ parameters:
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
name: Build
|
||||
displayName: Build
|
||||
name: BuildMSCCLPP
|
||||
displayName: Build MSCCL++
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
cd ..
|
||||
pip install .
|
||||
pip install -r ./python/requirements_cuda12.txt
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: SGLangSetup
|
||||
displayName: SGLang Setup
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
|
||||
cd sglang
|
||||
pip install --upgrade pip
|
||||
pip install -e "python"
|
||||
|
||||
- task: Bash@3
|
||||
name: InstallPackages
|
||||
displayName: Install Packages
|
||||
@@ -84,152 +98,6 @@ steps:
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: SendRecvTest
|
||||
displayName: Run mscclpp SendRecv test
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: AllReduceTest
|
||||
displayName: Run mscclpp AllReduce test
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: AllToAll
|
||||
displayName: Run mscclpp AllToAll test
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: CheckPerfNumber
|
||||
displayName: Check collective primitives performance
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: PythonAllReduceBenchmark
|
||||
displayName: Python Allreduce Benchmark
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
python3 -m pip install .; \
|
||||
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: FifoPerfBenchmark
|
||||
displayName: FIFO Performance Benchmark
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}"\
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
|
||||
set -e; \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
cd /root/mscclpp; \
|
||||
./build/bin/perf/fifo_test"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
- task: AzureCLI@2
|
||||
name: StopVMSS
|
||||
displayName: Deallocate VMSS
|
||||
|
||||
Reference in New Issue
Block a user