diff --git a/.azure-pipelines/templates/sglang-test.yaml b/.azure-pipelines/templates/sglang-test.yaml index a093e3e9..b686e4f2 100644 --- a/.azure-pipelines/templates/sglang-test.yaml +++ b/.azure-pipelines/templates/sglang-test.yaml @@ -3,8 +3,6 @@ parameters: type: string - name: vmssName type: string -- name: sshKeySecureFile - type: string - name: perfBaselineFile type: string default: 'test/deploy/perf_ndmv4.jsonl' @@ -12,115 +10,67 @@ parameters: type: string steps: -# - task: Bash@3 -# name: Build -# displayName: Build -# inputs: -# targetType: inline -# script: | -# git clone https://github.com/microsoft/mscclpp.git && cd mscclpp -# pip install . -# pip install -r ./python/requirements_cuda13.txt -# mkdir build && cd build -# cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. -# make -j -# workingDirectory: '$(System.DefaultWorkingDirectory)' +- template: deploy.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} + gpuArch: ${{ parameters.gpuArch }} + deployArgs: 'single-node-test' -- task: Bash@3 - name: InstallPackages - displayName: Install Packages - inputs: - targetType: inline - script: | - sudo apt-get update -y - sudo apt-get install pssh -y - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +- template: run-remote-task.yml + parameters: + name: AllGatherTest + displayName: Run mscclpp AllGather test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun 
--allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl -- task: DownloadSecureFile@1 - name: SshKeyFile - displayName: Download key file - inputs: - secureFile: ${{ parameters.sshKeySecureFile }} +- template: run-remote-task.yml + parameters: + name: SendRecvTest + displayName: Run mscclpp SendRecv test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl -- task: AzureCLI@2 - name: StartVMSS - displayName: Start VMSS - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp +- template: run-remote-task.yml + parameters: + name: AllReduceTest + displayName: Run mscclpp AllReduce test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN
./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl -- task: Bash@3 - name: DeployTestEnv - displayName: Deploy Test Env - inputs: - targetType: filePath - filePath: test/deploy/deploy.sh - arguments: "single-node-test" - workingDirectory: '$(System.DefaultWorkingDirectory)' +- template: run-remote-task.yml + parameters: + name: AllToAll + displayName: Run mscclpp AllToAll test + remoteScript: | + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl -- task: Bash@3 - name: SGLangSetup - displayName: SGLang Setup - inputs: - targetType: inline - script: | - hostname - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \ - sudo docker exec -t mscclpp-sglang-test bash -c " \ - python3 -m venv /root/venv && \ - git clone https://github.com/microsoft/mscclpp.git && \ - cd mscclpp && \ - mkdir build && \ - cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j && \ - cd .. && \ - /root/venv/bin/pip install . 
&& \ - /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \ - "' - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' +- template: run-remote-task.yml + parameters: + name: CheckPerfNumber + displayName: Check collective primitives performance + remoteScript: | + python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} -- task: Bash@3 - name: AllGatherTest - displayName: Run mscclpp AllGather test - inputs: - targetType: inline - script: | - set -e - HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci - SSH_OPTION="StrictHostKeyChecking=no" - KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} - : > azureuser@10.0.0.4 - tail -f azureuser@10.0.0.4 & - CHILD_PID=$! - parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION 'sudo docker exec -t mscclpp-sglang-test bash -c " \ - export PATH=/usr/local/mpi/bin:\$PATH; \ - export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ - echo Running on \$(hostname -i); \ - kill $CHILD_PID - workingDirectory: '$(System.DefaultWorkingDirectory)' +- template: run-remote-task.yml + parameters: + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + remoteScript: | + python3 -m pip install . + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py - -- task: AzureCLI@2 - name: StopVMSS - displayName: Deallocate VMSS - condition: always() - inputs: - azureSubscription: ${{ parameters.subscription }} - scriptType: bash - scriptLocation: inlineScript - inlineScript: | - az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file +- template: stop.yml + parameters: + subscription: ${{ parameters.subscription }} + vmssName: ${{ parameters.vmssName }} \ No newline at end of file