update for new remote run

This commit is contained in:
empyreus
2026-03-26 23:51:07 +00:00
parent 0a6d329bb8
commit fa30289415
3 changed files with 62 additions and 35 deletions

View File

@@ -43,5 +43,4 @@ jobs:
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'

View File

@@ -3,8 +3,6 @@ parameters:
type: string
- name: vmssName
type: string
- name: sshKeySecureFile
type: string
- name: perfBaselineFile
type: string
default: 'test/deploy/perf_ndmv4.jsonl'
@@ -33,36 +31,52 @@ steps:
gpuArch: ${{ parameters.gpuArch }}
deployArgs: 'single-node-test'
- task: Bash@3
name: SGLangSetup
displayName: SGLang Setup
inputs:
targetType: inline
script: |
hostname
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
: > azureuser@10.0.0.4
tail -f azureuser@10.0.0.4 &
CHILD_PID=$!
parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
sudo docker exec -t mscclpp-sglang-test bash -c " \
python3 -m venv /root/venv && \
git clone https://github.com/microsoft/mscclpp.git && \
cd mscclpp && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j && \
cd .. && \
/root/venv/bin/pip install . && \
/root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
"'
kill $CHILD_PID
workingDirectory: '$(System.DefaultWorkingDirectory)'
# Build and install mscclpp for the SGLang test through the run-remote-task.yml
# template. runRemoteArgs passes `--container mscclpp-sglang-test`; the remote
# script clones the mscclpp repository, builds it with CMake (Release) and
# make, then pip-installs the package and python/requirements_cuda12.txt.
# NOTE(review): run-remote's Docker path prepends `cd /root/mscclpp` under
# `set -e` before the decoded script runs, and it starts this container fresh
# from lmsysorg/sglang:latest. Confirm that /root/mscclpp exists in that image;
# otherwise the exec fails before the `git clone` below is reached.
- template: run-remote-task.yml
parameters:
name: SGLangTest
displayName: Run mscclpp SGLang test
runRemoteArgs: '--container mscclpp-sglang-test'
remoteScript: |
git clone https://github.com/microsoft/mscclpp.git
cd mscclpp
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
cd ..
pip install .
pip install -r ./python/requirements_cuda12.txt
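# NOTE: previous inline SGLangSetup step, left commented out. The SGLangTest
# template step above now performs the clone/build/install.
# - task: Bash@3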
# name: SGLangSetup
# displayName: SGLang Setup
# inputs:
# targetType: inline
# script: |
# hostname
# set -e
# HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
# SSH_OPTION="StrictHostKeyChecking=no"
# KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# : > azureuser@10.0.0.4
# tail -f azureuser@10.0.0.4 &
# CHILD_PID=$!
# parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
# -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
# sudo docker exec -t mscclpp-sglang-test bash -c " \
# python3 -m venv /root/venv && \
# git clone https://github.com/microsoft/mscclpp.git && \
# cd mscclpp && \
# mkdir build && \
# cd build && \
# cmake -DCMAKE_BUILD_TYPE=Release .. && \
# make -j && \
# cd .. && \
# /root/venv/bin/pip install . && \
# /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
# "'
# kill $CHILD_PID
# workingDirectory: '$(System.DefaultWorkingDirectory)'
- task: Bash@3
name: AllGatherTest

View File

@@ -11,6 +11,7 @@
# --hostfile Override hostfile path (default: test/deploy/hostfile_ci)
# --host Run command on a single host (uses parallel-ssh -H)
# --user SSH user when using --host or custom hostfile
# --container Docker container name to exec into (default: mscclpp-test)
set -e
@@ -23,9 +24,10 @@ USE_DOCKER=true
USE_LOG=true
TARGET_HOST=""
REMOTE_USER=""
CONTAINER_NAME="mscclpp-test"
usage() {
echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] [--container <name>] < <command_script>" >&2
}
require_value() {
@@ -56,6 +58,11 @@ while [[ "$1" == --* ]]; do
REMOTE_USER="$2"
shift 2
;;
--container)
require_value "--container" "${2-}"
CONTAINER_NAME="$2"
shift 2
;;
*) echo "Unknown option: $1" >&2; exit 1 ;;
esac
done
@@ -93,6 +100,13 @@ PSSH_COMMON=(
)
if $USE_DOCKER; then
# If using the sglang container, launch it first.
# Over parallel-ssh (shared PSSH_COMMON options): force-remove any existing
# container with this name (stderr suppressed; a failed removal does not stop
# the command), then start a detached container from lmsysorg/sglang:latest
# with --privileged, host networking and IPC, all GPUs, /mnt bind-mounted and
# /root as the working directory.
# NOTE(review): this runs on every invocation that passes
# --container mscclpp-sglang-test, so anything written to the container's own
# filesystem by an earlier run-remote call is discarded (only /mnt is mounted
# from the host). Confirm that later steps do not rely on it.
# NOTE(review): the INNER command built right after this begins with
# `cd /root/mscclpp` under `set -e`. Confirm that path exists in the freshly
# started image.
if [ "${CONTAINER_NAME}" = "mscclpp-sglang-test" ]; then
parallel-ssh -i "${PSSH_COMMON[@]}" \
"sudo docker rm -f ${CONTAINER_NAME} 2>/dev/null; \
sudo docker run -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash"
fi
INNER="set -euxo pipefail;"
INNER+=" cd /root/mscclpp;"
INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
@@ -100,7 +114,7 @@ if $USE_DOCKER; then
INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
parallel-ssh -i "${PSSH_COMMON[@]}" \
"sudo docker exec mscclpp-test bash -c \"${INNER}\""
"sudo docker exec ${CONTAINER_NAME} bash -c \"${INNER}\""
else
parallel-ssh -i "${PSSH_COMMON[@]}" \
"set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"