Replace the SSH-key-based SGLang setup step with the run-remote template.

Summary of changes:
  * .azure-pipelines/integration-test.yml and templates/sglang-test.yml no
    longer pass or declare the sshKeySecureFile parameter.
  * templates/sglang-test.yml replaces the inline SGLangSetup Bash task with
    the run-remote-task.yml template, passing
    runRemoteArgs '--container mscclpp-sglang-test' and a remoteScript that
    clones, builds and pip-installs mscclpp. The old task is deleted outright
    rather than kept as a commented-out block: it depends on the removed
    sshKeySecureFile parameter and remains available in version control.
  * test/deploy/run-remote.sh gains a --container option (default:
    mscclpp-test) that selects the container used by `docker exec`. When the
    name is mscclpp-sglang-test, the script first force-removes and recreates
    that container from lmsysorg/sglang:latest.

NOTE(review): run-remote.sh prefixes every in-container command with
`cd /root/mscclpp` under `set -e`. A container that was just created from
lmsysorg/sglang:latest presumably has no /root/mscclpp until the remoteScript
clones it - confirm that the SGLangTest step can get past that `cd`.

NOTE(review): the container is removed and recreated on every invocation with
`--container mscclpp-sglang-test`, so a later step that reuses that name would
lose whatever an earlier step built - confirm this is intended.

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index a01677cd..fdbf93df 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -43,5 +43,4 @@ jobs:
     parameters:
       subscription: mscclpp-ci
       vmssName: mscclpp-ci
-      sshKeySecureFile: mscclpp.pem
       gpuArch: '80'
diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml
--- a/.azure-pipelines/templates/sglang-test.yml
+++ b/.azure-pipelines/templates/sglang-test.yml
@@ -3,8 +3,6 @@ parameters:
   type: string
 - name: vmssName
   type: string
-- name: sshKeySecureFile
-  type: string
 - name: perfBaselineFile
   type: string
   default: 'test/deploy/perf_ndmv4.jsonl'
@@ -33,36 +31,21 @@ steps:
     gpuArch: ${{ parameters.gpuArch }}
     deployArgs: 'single-node-test'
 
-- task: Bash@3
-  name: SGLangSetup
-  displayName: SGLang Setup
-  inputs:
-    targetType: inline
-    script: |
-      hostname
-      set -e
-      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
-      SSH_OPTION="StrictHostKeyChecking=no"
-      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-      : > azureuser@10.0.0.4
-      tail -f azureuser@10.0.0.4 &
-      CHILD_PID=$!
-      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION 'sudo docker run -itd --name=mscclpp-sglang-test --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash && \
-        sudo docker exec -t mscclpp-sglang-test bash -c " \
-          python3 -m venv /root/venv && \
-          git clone https://github.com/microsoft/mscclpp.git && \
-          cd mscclpp && \
-          mkdir build && \
-          cd build && \
-          cmake -DCMAKE_BUILD_TYPE=Release .. && \
-          make -j && \
-          cd .. && \
-          /root/venv/bin/pip install . && \
-          /root/venv/bin/pip install -r ./python/requirements_cuda12.txt \
-        "'
-      kill $CHILD_PID
-    workingDirectory: '$(System.DefaultWorkingDirectory)'
+- template: run-remote-task.yml
+  parameters:
+    name: SGLangTest
+    displayName: Run mscclpp SGLang test
+    runRemoteArgs: '--container mscclpp-sglang-test'
+    remoteScript: |
+      git clone https://github.com/microsoft/mscclpp.git
+      cd mscclpp
+      mkdir build
+      cd build
+      cmake -DCMAKE_BUILD_TYPE=Release ..
+      make -j
+      cd ..
+      pip install .
+      pip install -r ./python/requirements_cuda12.txt
 
 - task: Bash@3
   name: AllGatherTest
diff --git a/test/deploy/run-remote.sh b/test/deploy/run-remote.sh
index b646ea92..b87a17dd 100755
--- a/test/deploy/run-remote.sh
+++ b/test/deploy/run-remote.sh
@@ -11,6 +11,7 @@
 #   --hostfile <path>   Override hostfile path (default: test/deploy/hostfile_ci)
 #   --host <host>       Run command on a single host (uses parallel-ssh -H)
 #   --user <user>       SSH user when using --host or custom hostfile
+#   --container <name>  Docker container name to exec into (default: mscclpp-test)
 
 set -e
 
@@ -23,9 +24,10 @@ USE_DOCKER=true
 USE_LOG=true
 TARGET_HOST=""
 REMOTE_USER=""
+CONTAINER_NAME="mscclpp-test"
 
 usage() {
-  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <host>] [--user <user>] < <script>" >&2
+  echo "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <host>] [--user <user>] [--container <name>] < <script>" >&2
 }
 
 require_value() {
@@ -56,6 +58,11 @@ while [[ "$1" == --* ]]; do
       REMOTE_USER="$2"
       shift 2
       ;;
+    --container)
+      require_value "--container" "${2-}"
+      CONTAINER_NAME="$2"
+      shift 2
+      ;;
     *) echo "Unknown option: $1" >&2; exit 1 ;;
   esac
 done
@@ -93,6 +100,13 @@ PSSH_COMMON=(
 )
 
 if $USE_DOCKER; then
+  # If using the sglang container, launch it first
+  if [ "${CONTAINER_NAME}" = "mscclpp-sglang-test" ]; then
+    parallel-ssh -i "${PSSH_COMMON[@]}" \
+      "sudo docker rm -f ${CONTAINER_NAME} 2>/dev/null; \
+       sudo docker run -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v /mnt:/mnt lmsysorg/sglang:latest bash"
+  fi
+
   INNER="set -euxo pipefail;"
   INNER+=" cd /root/mscclpp;"
   INNER+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
@@ -100,7 +114,7 @@ if $USE_DOCKER; then
   INNER+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d | bash -euxo pipefail"
 
   parallel-ssh -i "${PSSH_COMMON[@]}" \
-    "sudo docker exec mscclpp-test bash -c \"${INNER}\""
+    "sudo docker exec ${CONTAINER_NAME} bash -c \"${INNER}\""
 else
   parallel-ssh -i "${PSSH_COMMON[@]}" \
     "set -euxo pipefail; CMD_B64='${CMD_B64}'; printf '%s' \"\$CMD_B64\" | base64 -d | bash -euxo pipefail"