mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
fixes
This commit is contained in:
@@ -3,6 +3,8 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: perfBaselineFile
|
||||
type: string
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
@@ -10,67 +12,115 @@ parameters:
|
||||
type: string
|
||||
|
||||
steps:
|
||||
# Hand environment deployment to the shared deploy.yml template, forwarding
# this template's subscription, VMSS name and GPU architecture parameters.
# NOTE(review): 'single-node-test' is presumably forwarded to the deploy
# script as its argument - confirm in deploy.yml.
- template: deploy.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
    gpuArch: ${{ parameters.gpuArch }}
    deployArgs: 'single-node-test'
|
||||
# - task: Bash@3
|
||||
# name: Build
|
||||
# displayName: Build
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# git clone https://github.com/microsoft/mscclpp.git && cd mscclpp
|
||||
# pip install .
|
||||
# pip install -r ./python/requirements_cuda13.txt
|
||||
# mkdir build && cd build
|
||||
# cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
# make -j
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# AllGather performance sweep, passed to run-remote-task.yml as remoteScript.
# Runs allgather_test_perf four times with 8 MPI processes each: once without
# -k and then with -k 1, 2 and 3. Every run passes -o output.jsonl.
# NOTE(review): -b/-e/-f presumably set the begin size, end size and step
# factor, and -k the kernel variant - confirm against the test binary's usage.
- template: run-remote-task.yml
  parameters:
    name: AllGatherTest
    displayName: Run mscclpp AllGather test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
|
||||
# Installs agent-side tooling: the pssh package (which provides parallel-ssh,
# used by the inline scripts in this file) and the Azure CLI via Microsoft's
# Debian install script.
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: inline
    script: |
      # Stop at the first failing command, including a failure on the left
      # side of a pipe, so a broken install surfaces here instead of as a
      # missing tool in a later step.
      set -eo pipefail
      sudo apt-get update -y
      sudo apt-get install pssh -y
      # -f turns HTTP errors into a non-zero exit and -S keeps the error
      # message visible despite -s; without them a failed download would pipe
      # an empty script into bash and the step would still succeed.
      curl -fsSL https://aka.ms/InstallAzureCLIDeb | sudo bash
|
||||
|
||||
# SendRecv performance sweep, passed to run-remote-task.yml as remoteScript.
# A single sendrecv_test_perf run with 8 MPI processes and -o output.jsonl.
- template: run-remote-task.yml
  parameters:
    name: SendRecvTest
    displayName: Run mscclpp SendRecv test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
# Fetches the secure file named by the sshKeySecureFile parameter.
# NOTE(review): the step name SshKeyFile is presumably what makes the
# SSHKEYFILE_SECUREFILEPATH variable read by the inline scripts in this file
# available - confirm against the DownloadSecureFile@1 task documentation.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}
|
||||
|
||||
# AllReduce performance sweep, passed to run-remote-task.yml as remoteScript.
# Seven allreduce_test_perf runs with 8 MPI processes each:
#   - no -k and -k 1..4: sizes 1K to 1G with -f 2
#   - -k 5: sizes 12M to 48M with -i 3145728
#   - -k 6: sizes 24K to 768K with -i 24576, plus -w 100 -n 100
# Every run passes -o output.jsonl.
# The -k 5 run used to carry a stray positional "2" after "-i 3145728",
# apparently left over from the "-f 2" of the other sweeps; it is not the
# value of any option and has been removed.
# NOTE(review): -f/-i presumably select a multiplicative vs. fixed-byte size
# step, and -w/-n warm-up and iteration counts - confirm against the test
# binary's usage text.
- template: run-remote-task.yml
  parameters:
    name: AllReduceTest
    displayName: Run mscclpp AllReduce test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
|
||||
# Starts the scale set named by the vmssName parameter, using the service
# connection given by the subscription parameter. The resource group is
# fixed to "mscclpp".
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
|
||||
# AllToAll performance sweep, passed to run-remote-task.yml as remoteScript.
# Two alltoall_test_perf runs with 8 MPI processes each: one without -k and
# one with -k 1. Both pass -o output.jsonl.
- template: run-remote-task.yml
  parameters:
    name: AllToAll
    displayName: Run mscclpp AllToAll test
    remoteScript: |
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
# Runs test/deploy/deploy.sh from the repository checkout with the single
# argument "single-node-test".
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: "single-node-test"
    workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# Passes output.jsonl (the file named by -o in the perf-test steps) to
# check_perf_result.py together with the baseline file chosen by the
# perfBaselineFile parameter.
- template: run-remote-task.yml
  parameters:
    name: CheckPerfNumber
    displayName: Check collective primitives performance
    remoteScript: |
      python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file ${{ parameters.perfBaselineFile }}
|
||||
# Starts the mscclpp-sglang-test container (image lmsysorg/sglang:latest) on
# every host listed in test/deploy/hostfile_ci. Inside it, the step creates
# /root/venv, clones and builds mscclpp, and pip-installs the package and its
# CUDA 12 requirements into that venv.
- task: Bash@3
  name: SGLangSetup
  displayName: SGLang Setup
  inputs:
    targetType: inline
    script: |
      hostname
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      # parallel-ssh -o . saves each host's stdout to a file named after the
      # host in the current directory. Create that file up front and stream
      # it into the pipeline log while the remote command runs.
      # NOTE(review): assumes hostfile_ci holds the single entry
      # azureuser@10.0.0.4 - confirm.
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      # With set -e a failing parallel-ssh aborts the script before the final
      # kill, which would leave the background tail running. The trap stops
      # it on every exit path.
      trap 'kill "$CHILD_PID" 2>/dev/null || true' EXIT
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
        -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
        sudo docker exec -t mscclpp-sglang-test bash -c " \
          python3 -m venv /root/venv && \
          git clone https://github.com/microsoft/mscclpp.git && \
          cd mscclpp && \
          mkdir build && \
          cd build && \
          cmake -DCMAKE_BUILD_TYPE=Release .. && \
          make -j && \
          cd .. && \
          /root/venv/bin/pip install . && \
          /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
        "'
      kill $CHILD_PID
    workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# Python allreduce benchmark, passed to run-remote-task.yml as remoteScript.
# Installs the package from the current directory with pip, then launches
# allreduce_bench.py under mpirun with 8 processes and
# MSCCLPP_HOME=/root/mscclpp exported to them.
- template: run-remote-task.yml
  parameters:
    name: PythonAllReduceBenchmark
    displayName: Python Allreduce Benchmark
    remoteScript: |
      python3 -m pip install .
      mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
# Inline Bash variant of the AllGather test: runs a command inside the
# mscclpp-sglang-test container on every host in test/deploy/hostfile_ci via
# parallel-ssh, while a background tail streams the per-host output file.
# NOTE(review): the remote command is cut off in this view - the quoted
# docker exec string opens but never closes, and no test invocation follows
# the "echo Running on" line. Recover the missing lines from the original
# file before editing this step.
- task: Bash@3
|
||||
name: AllGatherTest
|
||||
displayName: Run mscclpp AllGather test
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker exec -t mscclpp-sglang-test bash -c " \
|
||||
export PATH=/usr/local/mpi/bin:\$PATH; \
|
||||
export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
|
||||
echo Running on \$(hostname -i); \
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
# Hand teardown to the shared stop.yml template, forwarding the subscription
# and VMSS name parameters.
- template: stop.yml
  parameters:
    subscription: ${{ parameters.subscription }}
    vmssName: ${{ parameters.vmssName }}
|
||||
|
||||
# Deallocates the scale set named by the vmssName parameter in the "mscclpp"
# resource group. condition: always() makes this step run even when an
# earlier step has failed.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
|
||||
@@ -2,6 +2,14 @@ set -e
|
||||
|
||||
# Target platform comes from the first positional argument; defaults to "cuda".
PLATFORM="${1:-cuda}"

# Create /root/venv only when no virtual environment is active and
# /root/venv has not been created yet.
if [ -z "${VIRTUAL_ENV}" ] && [ ! -f /root/venv/bin/activate ]; then
  python3 -m venv /root/venv
fi
# Activate /root/venv whenever it exists, whether it was just created or was
# already present.
if [ -f /root/venv/bin/activate ]; then
  . /root/venv/bin/activate
fi

# Move the public key from /root/mscclpp/sshkey.pub into place as root's
# authorized_keys file and make root its owner.
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys
|
||||
|
||||
Reference in New Issue
Block a user