From 99f2faced2c6dc0e54e53e18d5f7ea8a12554dd9 Mon Sep 17 00:00:00 2001 From: empyreus Date: Thu, 26 Mar 2026 22:32:09 +0000 Subject: [PATCH] fixes --- .azure-pipelines/templates/sglang-test.yaml | 164 +++++++++++++------- test/deploy/setup.sh | 8 + 2 files changed, 115 insertions(+), 57 deletions(-) diff --git a/.azure-pipelines/templates/sglang-test.yaml b/.azure-pipelines/templates/sglang-test.yaml index b686e4f2..a093e3e9 100644 --- a/.azure-pipelines/templates/sglang-test.yaml +++ b/.azure-pipelines/templates/sglang-test.yaml @@ -3,6 +3,8 @@ parameters: type: string - name: vmssName type: string +- name: sshKeySecureFile + type: string - name: perfBaselineFile type: string default: 'test/deploy/perf_ndmv4.jsonl' @@ -10,67 +12,115 @@ parameters: type: string steps: -- template: deploy.yml - parameters: - subscription: ${{ parameters.subscription }} - vmssName: ${{ parameters.vmssName }} - gpuArch: ${{ parameters.gpuArch }} - deployArgs: 'single-node-test' +# - task: Bash@3 +# name: Build +# displayName: Build +# inputs: +# targetType: inline +# script: | +# git clone https://github.com/microsoft/mscclpp.git && cd mscclpp +# pip install . +# pip install -r ./python/requirements_cuda13.txt +# mkdir build && cd build +# cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. +# make -j +# workingDirectory: '$(System.DefaultWorkingDirectory)' -- template: run-remote-task.yml - parameters: - name: AllGatherTest - displayName: Run mscclpp AllGather test - remoteScript: | - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl +- task: Bash@3 + name: InstallPackages + displayName: Install Packages + inputs: + targetType: inline + script: | + sudo apt-get update -y + sudo apt-get install pssh -y + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash -- template: run-remote-task.yml - parameters: - name: SendRecvTest - displayName: Run mscclpp SendRecv test - remoteScript: | - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl +- task: DownloadSecureFile@1 + name: SshKeyFile + displayName: Download key file + inputs: + secureFile: ${{ parameters.sshKeySecureFile }} -- template: run-remote-task.yml - parameters: - name: AllReduceTest - displayName: Run mscclpp AllReduce test - remoteScript: | - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl +- task: AzureCLI@2 + name: StartVMSS + displayName: Start VMSS + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp -- template: run-remote-task.yml - parameters: - name: AllToAll - displayName: Run mscclpp AllToAll test - remoteScript: | - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl - mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl +- task: Bash@3 + name: DeployTestEnv + displayName: Deploy Test Env + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: "single-node-test" + workingDirectory: '$(System.DefaultWorkingDirectory)' -- template: run-remote-task.yml - parameters: - name: CheckPerfNumber - displayName: Check collective primitives performance - remoteScript: | - python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }} +- task: Bash@3 + name: SGLangSetup + displayName: SGLang Setup + inputs: + targetType: inline + script: | + hostname + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \ + sudo docker exec -t mscclpp-sglang-test bash -c " \ + python3 -m venv /root/venv && \ + git clone https://github.com/microsoft/mscclpp.git && \ + cd mscclpp && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release .. && \ + make -j && \ + cd .. && \ + /root/venv/bin/pip install . && \ + /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \ + "' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' -- template: run-remote-task.yml - parameters: - name: PythonAllReduceBenchmark - displayName: Python Allreduce Benchmark - remoteScript: | - python3 -m pip install . - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py +- task: Bash@3 + name: AllGatherTest + displayName: Run mscclpp AllGather test + inputs: + targetType: inline + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-sglang-test bash -c " \ + export PATH=/usr/local/mpi/bin:\$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + echo Running on \$(hostname -i); \ + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' -- template: stop.yml - parameters: - subscription: ${{ parameters.subscription }} - vmssName: ${{ parameters.vmssName }} \ No newline at end of file + +- task: AzureCLI@2 + name: StopVMSS + displayName: Deallocate VMSS + condition: always() + inputs: + azureSubscription: ${{ parameters.subscription }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp \ No newline at end of file diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 80cd10b1..4a16fdc3 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -2,6 +2,14 @@ set -e PLATFORM="${1:-cuda}" +# Create a Python venv if one is not already active +if [ -z "${VIRTUAL_ENV}" ] && [ ! -f /root/venv/bin/activate ]; then + python3 -m venv /root/venv +fi +if [ -f /root/venv/bin/activate ]; then + . /root/venv/bin/activate +fi + mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys