This commit is contained in:
empyreus
2026-03-26 22:32:09 +00:00
parent 01b7af9733
commit 99f2faced2
2 changed files with 115 additions and 57 deletions

View File

@@ -3,6 +3,8 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
@@ -10,67 +12,115 @@ parameters:
type: string
steps:
- template: deploy.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
# - task: Bash@3
# name: Build
# displayName: Build
# inputs:
# targetType: inline
# script: |
# git clone https://github.com/microsoft/mscclpp.git && cd mscclpp
# pip install .
# pip install -r ./python/requirements_cuda13.txt
# mkdir build && cd build
# cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
# make -j
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: run-remote-task.yml
parameters:
name: AllGatherTest
displayName: Run mscclpp AllGather test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- task: Bash@3
name: InstallPackages
displayName: Install Packages
inputs:
targetType: inline
script: |
sudo apt-get update -y
sudo apt-get install pssh -y
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
- template: run-remote-task.yml
parameters:
name: SendRecvTest
displayName: Run mscclpp SendRecv test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- task: DownloadSecureFile@1
name: SshKeyFile
displayName: Download key file
inputs:
secureFile: ${{ parameters.sshKeySecureFile }}
- template: run-remote-task.yml
parameters:
name: AllReduceTest
displayName: Run mscclpp AllReduce test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- task: AzureCLI@2
name: StartVMSS
displayName: Start VMSS
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
- template: run-remote-task.yml
parameters:
name: AllToAll
displayName: Run mscclpp AllToAll test
remoteScript: |
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- task: Bash@3
name: DeployTestEnv
displayName: Deploy Test Env
inputs:
targetType: filePath
filePath: test/deploy/deploy.sh
arguments: "single-node-test"
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: run-remote-task.yml
parameters:
name: CheckPerfNumber
displayName: Check collective primitives performance
remoteScript: |
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
- task: Bash@3
name: SGLangSetup
displayName: SGLang Setup
inputs:
targetType: inline
script: |
hostname
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
sudo docker exec -t mscclpp-sglang-test bash -c " \
python3 -m venv /root/venv && \
git clone https://github.com/microsoft/mscclpp.git && \
cd mscclpp && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j && \
cd .. && \
/root/venv/bin/pip install . && \
/root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: run-remote-task.yml
parameters:
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
remoteScript: |
python3 -m pip install .
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
- task: Bash@3
name: AllGatherTest
displayName: Run mscclpp AllGather test
inputs:
targetType: inline
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-sglang-test bash -c " \
export PATH=/usr/local/mpi/bin:\$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
echo Running on \$(hostname -i); \
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
- template: stop.yml
parameters:
subscription: ${{ parameters.subscription }}
vmssName: ${{ parameters.vmssName }}
- task: AzureCLI@2
name: StopVMSS
displayName: Deallocate VMSS
condition: always()
inputs:
azureSubscription: ${{ parameters.subscription }}
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp

View File

@@ -2,6 +2,14 @@ set -e
PLATFORM="${1:-cuda}"
# Create a Python venv if one is not already active
if [ -z "${VIRTUAL_ENV}" ] && [ ! -f /root/venv/bin/activate ]; then
python3 -m venv /root/venv
fi
if [ -f /root/venv/bin/activate ]; then
. /root/venv/bin/activate
fi
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys