mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
update for new remote run
This commit is contained in:
@@ -43,5 +43,4 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
sshKeySecureFile: mscclpp.pem
|
||||
gpuArch: '80'
|
||||
|
||||
@@ -3,8 +3,6 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: sshKeySecureFile
|
||||
type: string
|
||||
- name: perfBaselineFile
|
||||
type: string
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
@@ -33,36 +31,52 @@ steps:
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test'
|
||||
|
||||
- task: Bash@3
|
||||
name: SGLangSetup
|
||||
displayName: SGLang Setup
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
hostname
|
||||
set -e
|
||||
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
: > azureuser@10.0.0.4
|
||||
tail -f azureuser@10.0.0.4 &
|
||||
CHILD_PID=$!
|
||||
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
-O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
|
||||
sudo docker exec -t mscclpp-sglang-test bash -c " \
|
||||
python3 -m venv /root/venv && \
|
||||
git clone https://github.com/microsoft/mscclpp.git && \
|
||||
cd mscclpp && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j && \
|
||||
cd .. && \
|
||||
/root/venv/bin/pip install . && \
|
||||
/root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
|
||||
"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
name: SGLangTest
|
||||
displayName: Run mscclpp SGLang test
|
||||
runRemoteArgs: '--container mscclpp-sglang-test'
|
||||
remoteScript: |
|
||||
git clone https://github.com/microsoft/mscclpp.git
|
||||
cd mscclpp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
cd ..
|
||||
pip install .
|
||||
pip install -r ./python/requirements_cuda12.txt
|
||||
|
||||
# - task: Bash@3
|
||||
# name: SGLangSetup
|
||||
# displayName: SGLang Setup
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# hostname
|
||||
# set -e
|
||||
# HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
|
||||
# SSH_OPTION="StrictHostKeyChecking=no"
|
||||
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
# : > azureuser@10.0.0.4
|
||||
# tail -f azureuser@10.0.0.4 &
|
||||
# CHILD_PID=$!
|
||||
# parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
|
||||
# -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
|
||||
# sudo docker exec -t mscclpp-sglang-test bash -c " \
|
||||
# python3 -m venv /root/venv && \
|
||||
# git clone https://github.com/microsoft/mscclpp.git && \
|
||||
# cd mscclpp && \
|
||||
# mkdir build && \
|
||||
# cd build && \
|
||||
# cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
# make -j && \
|
||||
# cd .. && \
|
||||
# /root/venv/bin/pip install . && \
|
||||
# /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
|
||||
# "'
|
||||
# kill $CHILD_PID
|
||||
# workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
- task: Bash@3
|
||||
name: AllGatherTest
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
# --hostfile Override hostfile path (default: test/deploy/hostfile_ci)
|
||||
# --host Run command on a single host (uses parallel-ssh -H)
|
||||
# --user SSH user when using --host or custom hostfile
|
||||
# --container Docker container name to exec into (default: mscclpp-test)
|
||||
|
||||
set -e
|
||||
|
||||
@@ -23,9 +24,10 @@ USE_DOCKER=true
|
||||
USE_LOG=true
|
||||
TARGET_HOST=""
|
||||
REMOTE_USER=""
|
||||
CONTAINER_NAME="mscclpp-test"
|
||||
|
||||
# Print the command-line synopsis for this script to stderr.
#
# Exactly one "Usage:" line is emitted, and it lists every option the
# argument parser accepts, including --container. An older synopsis that
# omitted --container is no longer printed alongside it.
# Arguments: none
# Outputs:   the synopsis on stderr; nothing on stdout
usage() {
  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] [--container <name>] < <command_script>" >&2
}
|
||||
|
||||
require_value() {
|
||||
@@ -56,6 +58,11 @@ while [[ "$1" == --* ]]; do
|
||||
REMOTE_USER="$2"
|
||||
shift 2
|
||||
;;
|
||||
--container)
|
||||
require_value "--container" "${2-}"
|
||||
CONTAINER_NAME="$2"
|
||||
shift 2
|
||||
;;
|
||||
*) echo "Unknown option: $1" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
@@ -93,6 +100,13 @@ PSSH_COMMON=(
|
||||
)
|
||||
|
||||
if $USE_DOCKER; then
|
||||
# If using the sglang container, launch it first
|
||||
if [ "${CONTAINER_NAME}" = "mscclpp-sglang-test" ]; then
|
||||
parallel-ssh -i "${PSSH_COMMON[@]}" \
|
||||
"sudo docker rm -f ${CONTAINER_NAME} 2>/dev/null; \
|
||||
sudo docker run -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash"
|
||||
fi
|
||||
|
||||
INNER="set -euxo pipefail;"
|
||||
INNER+=" cd /root/mscclpp;"
|
||||
INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
|
||||
@@ -100,7 +114,7 @@ if $USE_DOCKER; then
|
||||
INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
|
||||
|
||||
parallel-ssh -i "${PSSH_COMMON[@]}" \
|
||||
"sudo docker exec mscclpp-test bash -c \"${INNER}\""
|
||||
"sudo docker exec ${CONTAINER_NAME} bash -c \"${INNER}\""
|
||||
else
|
||||
parallel-ssh -i "${PSSH_COMMON[@]}" \
|
||||
"set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"
|
||||
|
||||
Reference in New Issue
Block a user