mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
trying to rework image pull
This commit is contained in:
@@ -32,6 +32,9 @@ parameters:
|
||||
- name: deployArgs
|
||||
type: string
|
||||
default: ''
|
||||
- name: containerName
|
||||
type: string
|
||||
default: 'mscclpp-test'
|
||||
|
||||
steps:
|
||||
# 0. Ensure Azure CLI exists before running AzureCLI@2 tasks.
|
||||
@@ -129,5 +132,5 @@ steps:
|
||||
inputs:
|
||||
targetType: filePath
|
||||
filePath: test/deploy/deploy.sh
|
||||
arguments: ${{ parameters.deployArgs }}
|
||||
arguments: ${{ parameters.deployArgs }} ${{ parameters.containerName }}
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -8,6 +8,9 @@ parameters:
|
||||
default: 'test/deploy/perf_ndmv4.jsonl'
|
||||
- name: gpuArch
|
||||
type: string
|
||||
- name: containerName
|
||||
type: string
|
||||
default: 'sglang-mscclpp-test'
|
||||
|
||||
steps:
|
||||
- template: deploy.yml
|
||||
@@ -16,6 +19,7 @@ steps:
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'single-node-test'
|
||||
containerName: ${{ parameters.containerName }}
|
||||
|
||||
- template: run-remote-task.yml
|
||||
parameters:
|
||||
|
||||
@@ -1,26 +1,53 @@
|
||||
#!/bin/bash
|
||||
# deploy.sh — Provisions remote hosts, copies sources, and launches Docker containers
|
||||
# for mscclpp CI/CD test environments.
|
||||
#
|
||||
# Usage: deploy.sh <test_name> [ib_environment] [platform] [container_name]
|
||||
# test_name : Test suite to deploy (e.g. single-node-test, nccltest-single-node)
|
||||
# ib_environment : Enable InfiniBand networking (default: true)
|
||||
# platform : Target GPU platform — "cuda" or "rocm" (default: cuda)
|
||||
# container_name : Docker container name (default: mscclpp-test)
|
||||
|
||||
set -ex
|
||||
|
||||
###############################################################################
|
||||
# 1. Parse arguments
|
||||
###############################################################################
|
||||
TEST_NAME=$1
|
||||
IB_ENVIRONMENT="${2:-true}"
|
||||
PLATFORM="${3:-cuda}"
|
||||
CONTAINER_NAME="${4:-mscclpp-test}"
|
||||
|
||||
###############################################################################
|
||||
# 2. Resolve paths and host file
|
||||
###############################################################################
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
|
||||
|
||||
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
|
||||
ROOT_DIR="${ROOT_DIR}/mscclpp"
|
||||
SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
|
||||
fi
|
||||
|
||||
DST_DIR="/tmp/mscclpp"
|
||||
|
||||
if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
|
||||
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
|
||||
else
|
||||
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
|
||||
fi
|
||||
|
||||
SSH_OPTION="StrictHostKeyChecking=no"
|
||||
|
||||
###############################################################################
|
||||
# 3. Prepare SSH keys
|
||||
###############################################################################
|
||||
chmod 400 ${KeyFilePath}
|
||||
ssh-keygen -t rsa -f sshkey -P ""
|
||||
|
||||
###############################################################################
|
||||
# 4. Wait for remote hosts to be reachable
|
||||
###############################################################################
|
||||
while true; do
|
||||
set +e
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "hostname"
|
||||
@@ -30,34 +57,80 @@ while true; do
|
||||
echo "Waiting for sshd to start..."
|
||||
sleep 5
|
||||
done
|
||||
|
||||
set -e
|
||||
|
||||
###############################################################################
|
||||
# 5. Copy source tree to remote hosts
|
||||
###############################################################################
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
|
||||
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
|
||||
|
||||
###############################################################################
|
||||
# 6. Platform-specific setup (ROCm kernel module)
|
||||
###############################################################################
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
|
||||
fi
|
||||
|
||||
# force to pull the latest image
|
||||
###############################################################################
|
||||
# 7. Pull the latest container image
|
||||
###############################################################################
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker pull ${CONTAINERIMAGE}"
|
||||
|
||||
LAUNCH_OPTION="--gpus=all"
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
|
||||
###############################################################################
|
||||
# 8. Launch Docker container
|
||||
###############################################################################
|
||||
|
||||
if [ "${CONTAINER_NAME}" == "sglang-mscclpp-test" ]; then
|
||||
# Set GPU passthrough flags based on platform
|
||||
LAUNCH_OPTION="--gpus=all"
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
|
||||
fi
|
||||
|
||||
if [ "${IB_ENVIRONMENT}" == "true" ]; then
|
||||
# InfiniBand: use --privileged for RDMA device access
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
|
||||
-w /root -v /mnt:/mnt -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=sglang-mscclpp-test \
|
||||
--entrypoint /bin/bash lmsysorg/sglang:latest"
|
||||
else
|
||||
# Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
|
||||
-w /root -v /mnt:/mnt -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=sglang-mscclpp-test \
|
||||
--entrypoint /bin/bash lmsysorg/sglang:latest"
|
||||
fi
|
||||
else
|
||||
# Set GPU passthrough flags based on platform
|
||||
LAUNCH_OPTION="--gpus=all"
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
|
||||
fi
|
||||
|
||||
if [ "${IB_ENVIRONMENT}" == "true" ]; then
|
||||
# InfiniBand: use --privileged for RDMA device access
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
else
|
||||
# Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
fi
|
||||
fi
|
||||
if [ "${IB_ENVIRONMENT}" == "true" ]; then
|
||||
|
||||
###############################################################################
|
||||
# 9. Run setup script inside the container
|
||||
###############################################################################
|
||||
if [ "${CONTAINER_NAME}" = "sglang-mscclpp-test" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
"sudo docker exec -t --user root sglang-mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
|
||||
else
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
|
||||
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
|
||||
--entrypoint /bin/bash ${CONTAINERIMAGE}"
|
||||
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
|
||||
fi
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user