From d7b0dd627e2a2f90fb8b2ac4eafa6c8f09d29544 Mon Sep 17 00:00:00 2001
From: empyreus
Date: Wed, 1 Apr 2026 15:31:32 +0000
Subject: [PATCH] trying to rework image pull

---
Review notes (this section is not part of the commit message):
 * deploy.yml appended the container name after deployArgs, so with
   deployArgs 'single-node-test' it arrived as the second positional argument
   (ib_environment) instead of the fourth. It is now passed as
   --container-name=NAME, which deploy.sh accepts at any position.
 * deploy.sh now feeds CONTAINER_NAME, IMAGE and DATA_MOUNT into the existing
   docker run/exec commands instead of duplicating them per container, and
   pulls the image that is actually started.
 * The line breaks of this patch had been collapsed. Indentation and blank
   context lines were reconstructed and must be checked against the target
   tree before applying. The post-image blob ids on the index lines of
   deploy.yml and deploy.sh are stale; regenerate with git format-patch.

 .azure-pipelines/templates/deploy.yml      |  5 +-
 .azure-pipelines/templates/sglang-test.yml |  4 +
 test/deploy/deploy.sh                      | 95 +++++++++++++++++++---
 3 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/.azure-pipelines/templates/deploy.yml b/.azure-pipelines/templates/deploy.yml
index b02db955..46259d35 100644
--- a/.azure-pipelines/templates/deploy.yml
+++ b/.azure-pipelines/templates/deploy.yml
@@ -32,6 +32,9 @@ parameters:
 - name: deployArgs
   type: string
   default: ''
+- name: containerName
+  type: string
+  default: 'mscclpp-test'
 
 steps:
 # 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
@@ -129,5 +132,5 @@ steps:
   inputs:
     targetType: filePath
     filePath: test/deploy/deploy.sh
-    arguments: ${{ parameters.deployArgs }}
+    arguments: ${{ parameters.deployArgs }} --container-name=${{ parameters.containerName }}
     workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/templates/sglang-test.yml b/.azure-pipelines/templates/sglang-test.yml
index 774de16c..72e8b08d 100644
--- a/.azure-pipelines/templates/sglang-test.yml
+++ b/.azure-pipelines/templates/sglang-test.yml
@@ -8,6 +8,9 @@ parameters:
   default: 'test/deploy/perf_ndmv4.jsonl'
 - name: gpuArch
   type: string
+- name: containerName
+  type: string
+  default: 'sglang-mscclpp-test'
 
 steps:
 - template: deploy.yml
@@ -16,6 +19,7 @@ steps:
     vmssName: ${{ parameters.vmssName }}
     gpuArch: ${{ parameters.gpuArch }}
     deployArgs: 'single-node-test'
+    containerName: ${{ parameters.containerName }}
 
 - template: run-remote-task.yml
   parameters:
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
index 1f1d0e52..6f81fc63 100644
--- a/test/deploy/deploy.sh
+++ b/test/deploy/deploy.sh
@@ -1,26 +1,71 @@
+#!/bin/bash
+# deploy.sh — Provisions remote hosts, copies sources, and launches Docker containers
+# for mscclpp CI/CD test environments.
+#
+# Usage: deploy.sh test_name [ib_environment] [platform] [container_name]
+#                  [--container-name=NAME]
+#   test_name      : Test suite to deploy (e.g. single-node-test, nccltest-single-node)
+#   ib_environment : Enable InfiniBand networking (default: true)
+#   platform       : Target GPU platform — "cuda" or "rocm" (default: cuda)
+#   container_name : Docker container name (default: mscclpp-test)
+#   --container-name=NAME
+#                  : Same as container_name, but accepted at any position;
+#                    takes precedence over the positional form
+
 set -ex
 
+###############################################################################
+# 1. Parse arguments
+###############################################################################
+# The pipeline template appends the container name after a caller-supplied
+# argument string of unknown length, so it cannot rely on landing in "$4".
+# Strip --container-name=NAME out of the argument list first, then read the
+# remaining arguments positionally.
+CONTAINER_NAME_OPT=""
+POSITIONAL_ARGS=()
+for arg in "$@"; do
+  case "${arg}" in
+    --container-name=*) CONTAINER_NAME_OPT="${arg#--container-name=}" ;;
+    *) POSITIONAL_ARGS+=("${arg}") ;;
+  esac
+done
+set -- "${POSITIONAL_ARGS[@]}"
+
 TEST_NAME=$1
 IB_ENVIRONMENT="${2:-true}"
 PLATFORM="${3:-cuda}"
+CONTAINER_NAME="${CONTAINER_NAME_OPT:-${4:-mscclpp-test}}"
 
+###############################################################################
+# 2. Resolve paths and host file
+###############################################################################
 KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
 ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
+
 if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
   ROOT_DIR="${ROOT_DIR}/mscclpp"
   SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
 fi
+
 DST_DIR="/tmp/mscclpp"
+
 if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
 else
   HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
 fi
+
 SSH_OPTION="StrictHostKeyChecking=no"
 
+###############################################################################
+# 3. Prepare SSH keys
+###############################################################################
 chmod 400 ${KeyFilePath}
 ssh-keygen -t rsa -f sshkey -P ""
 
+###############################################################################
+# 4. Wait for remote hosts to be reachable
+###############################################################################
 while true; do
   set +e
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "hostname"
@@ -30,34 +75,66 @@ while true; do
   echo "Waiting for sshd to start..."
   sleep 5
 done
-
 set -e
+
+###############################################################################
+# 5. Copy source tree to remote hosts
+###############################################################################
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
 parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
 
+###############################################################################
+# 6. Platform-specific setup (ROCm kernel module)
+###############################################################################
 if [ "${PLATFORM}" == "rocm" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
 fi
 
-# force to pull the latest image
+###############################################################################
+# 7. Select the container image and data mount, then pull the image
+###############################################################################
+# The sglang container runs the upstream sglang image with /mnt mounted; every
+# other container runs ${CONTAINERIMAGE} with the copied source tree mounted.
+# NOTE(review): the sglang container does not mount ${DST_DIR}, but step 9 runs
+# /root/mscclpp/test/deploy/setup.sh inside it. Confirm that path exists in the
+# sglang image, or mount ${DST_DIR} there as well.
+if [ "${CONTAINER_NAME}" == "sglang-mscclpp-test" ]; then
+  IMAGE="lmsysorg/sglang:latest"
+  DATA_MOUNT="-v /mnt:/mnt"
+else
+  IMAGE="${CONTAINERIMAGE}"
+  DATA_MOUNT="-v ${DST_DIR}:/root/mscclpp"
+fi
+
+# Pull the image that will actually be started so its tag is refreshed.
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-  "sudo docker pull ${CONTAINERIMAGE}"
+  "sudo docker pull ${IMAGE}"
 
+###############################################################################
+# 8. Launch Docker container
+###############################################################################
+# Set GPU passthrough flags based on platform
 LAUNCH_OPTION="--gpus=all"
 if [ "${PLATFORM}" == "rocm" ]; then
   LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
 fi
+
 if [ "${IB_ENVIRONMENT}" == "true" ]; then
+  # InfiniBand: use --privileged for RDMA device access
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
     "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
-    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
-    --entrypoint /bin/bash ${CONTAINERIMAGE}"
+    -w /root ${DATA_MOUNT} -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \
+    --entrypoint /bin/bash ${IMAGE}"
 else
+  # Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
     "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
-    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
-    --entrypoint /bin/bash ${CONTAINERIMAGE}"
+    -w /root ${DATA_MOUNT} -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \
+    --entrypoint /bin/bash ${IMAGE}"
 fi
+
+###############################################################################
+# 9. Run setup script inside the container
+###############################################################################
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
-  "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
+  "sudo docker exec -t --user root ${CONTAINER_NAME} bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
-