mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 09:17:06 +00:00
161 lines
7.6 KiB
Bash
161 lines
7.6 KiB
Bash
#!/bin/bash
|
|
# deploy.sh — Provisions remote hosts, copies sources, and launches Docker containers
|
|
# for mscclpp CI/CD test environments.
|
|
#
|
|
# Usage: deploy.sh <test_name> [ib_environment] [platform] [container_name]
|
|
# test_name : Test suite to deploy (e.g. single-node-test, nccltest-single-node)
|
|
# ib_environment : Enable InfiniBand networking (default: true)
|
|
# platform : Target GPU platform — "cuda" or "rocm" (default: cuda)
|
|
# container_name : Docker container name (default: mscclpp-test)
|
|
|
|
set -ex

###############################################################################
# 1. Parse arguments
###############################################################################
# Positional arguments (see the usage header at the top of this file):
#   $1 test_name       - test suite to deploy; selects the host file below
#   $2 ib_environment  - "true" (default) to launch the container for InfiniBand
#   $3 platform        - "cuda" (default) or "rocm"
#   $4 container_name  - Docker container name (default: mscclpp-test)
TEST_NAME="${1:-}"
IB_ENVIRONMENT="${2:-true}"
PLATFORM="${3:-cuda}"
# Apply the documented default: an empty name would break the later
# `docker rm` / `docker exec` steps that address the container by name.
CONTAINER_NAME="${4:-mscclpp-test}"
|
|
|
|
###############################################################################
# 2. Resolve paths and host file
###############################################################################
# Private key handed to every ssh/scp invocation below (via `-x "-i ..."`).
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
# Local directory that gets archived (trailing slash included) and the
# directory it is unpacked into on the remote hosts.
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
DST_DIR="/tmp/mscclpp"

# The two single-node suites read hostfile_ci; every other suite reads hostfile.
case "${TEST_NAME}" in
  nccltest-single-node | single-node-test)
    HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
    ;;
  *)
    HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
    ;;
esac

# Passed to parallel-ssh / parallel-scp through `-O`.
SSH_OPTION="StrictHostKeyChecking=no"
|
|
|
|
###############################################################################
# 3. Prepare SSH keys
###############################################################################
# Restrict the provided private key to owner-read-only (ssh rejects private
# keys that other users can read). Abort with a clear message when the key
# path is empty instead of letting chmod fail on a missing operand.
chmod 400 "${KeyFilePath:?KeyFilePath is empty; is SSHKEYFILE_SECUREFILEPATH set?}"

# Generate a passphrase-less RSA key pair (sshkey / sshkey.pub) in the current
# directory. Leftovers from a previous run are removed first: ssh-keygen asks
# for confirmation before overwriting an existing key, which makes a
# non-interactive re-run fail.
rm -f sshkey sshkey.pub
ssh-keygen -t rsa -f sshkey -P ""
|
|
|
|
###############################################################################
# 4. Wait for remote hosts to be reachable
###############################################################################
# Retry every 5 seconds until parallel-ssh manages to run `hostname` and exits
# with status 0. A command used as the `until` condition is exempt from
# errexit, so `set -e` does not have to be switched off and on around the probe.
# NOTE: there is no upper bound on the number of retries.
until parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" "hostname"; do
  echo "Waiting for sshd to start..."
  sleep 5
done
|
|
|
|
###############################################################################
# 5. Copy source tree to remote hosts
###############################################################################
# ROOT_DIR is "<working directory>/". If the working directory variable is
# empty it collapses to "/", and the tar below would archive the entire local
# filesystem. Refuse to continue in that case.
if [[ -z "${ROOT_DIR%/}" ]]; then
  echo "ROOT_DIR is empty or '/'; is SYSTEM_DEFAULTWORKINGDIRECTORY set?" >&2
  exit 1
fi

# Start from a clean destination directory on every host.
parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "sudo rm -rf ${DST_DIR:?DST_DIR is not set}"

# Build the archive under a unique local name so that concurrent runs on the
# same machine cannot overwrite each other's tarball.
LOCAL_TARBALL="$(mktemp /tmp/mscclpp.tar.gz.XXXXXX)"
tar czf "${LOCAL_TARBALL}" -C "${ROOT_DIR}" .

# Ship the archive, unpack it into DST_DIR, and delete the remote copy.
parallel-scp -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "${LOCAL_TARBALL}" /tmp/mscclpp.tar.gz
parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"

# Delete the local archive as well.
rm -f "${LOCAL_TARBALL}"
|
|
|
|
###############################################################################
# 6. Platform-specific setup (ROCm kernel module)
###############################################################################
# ROCm only: load the amdgpu kernel module on every host.
if [[ "${PLATFORM}" == "rocm" ]]; then
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "sudo modprobe amdgpu"
fi
|
|
|
|
###############################################################################
|
|
# 7. Install GDRCopy on the hosts (CUDA only) and pull the latest container image
|
|
###############################################################################
|
|
# Install GDRCopy kernel module on host VMs (CUDA only)
#
# The remote script does nothing on hosts where gdrdrv is already loaded.
# Otherwise it installs the build dependencies, downloads the GDRCopy release
# given by GDRCOPY_VERSION, builds the Debian packages, installs the DKMS
# package, loads the module, and removes the build files.
#
# Every step is on its own line on purpose: under `set -e` a failing command
# on the left-hand side of `a && b` does not abort the script, so chained
# steps would let a failed update/cd/extract slip through.
# ${GDRCOPY_VERSION} is expanded locally before the script is sent.
GDRCOPY_VERSION="2.5.2"
if [[ "${PLATFORM}" == "cuda" ]]; then
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "if lsmod | grep -q gdrdrv; then
      echo 'gdrdrv module already loaded'
    else
      set -e
      sudo apt-get update -y
      sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
      cd /tmp
      wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
      tar xzf gdrcopy.tar.gz
      cd gdrcopy-${GDRCOPY_VERSION}/packages
      CUDA=/usr/local/cuda ./build-deb-packages.sh
      sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
      sudo modprobe gdrdrv
      rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
    fi"
fi
|
|
|
|
# Force a pull so the hosts pick up the latest image rather than a cached one.
# Fail locally with a clear message when CONTAINERIMAGE is unset, instead of
# sending an argument-less `docker pull` to every host.
parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "sudo docker pull ${CONTAINERIMAGE:?CONTAINERIMAGE is not set}"
|
|
|
|
###############################################################################
# 8. Remove any existing container with the same name
###############################################################################
# Fall back to the documented default when no container name was supplied, so
# that the remove / run / exec steps below all address the same, non-empty
# container name.
CONTAINER_NAME="${CONTAINER_NAME:-mscclpp-test}"

# Best effort: a host without such a container is not an error.
parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "sudo docker rm -f ${CONTAINER_NAME} 2>/dev/null || true"

###############################################################################
# 9. Launch Docker container
###############################################################################
if [[ "${CONTAINER_NAME}" == "sglang-mscclpp-test" ]]; then
  # SGLang runs use the lmsysorg/sglang image instead of ${CONTAINERIMAGE}.
  # This container is started without --rm, so it persists after it stops.
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "sudo docker run -itd --name=${CONTAINER_NAME} --privileged --net=host --ipc=host --gpus=all -w /root -v ${DST_DIR}:/root/mscclpp --entrypoint /bin/bash lmsysorg/sglang:latest"
else
  # Set GPU passthrough flags based on platform
  LAUNCH_OPTION="--gpus=all"
  if [[ "${PLATFORM}" == "rocm" ]]; then
    LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
  fi

  if [[ "${IB_ENVIRONMENT}" == "true" ]]; then
    # InfiniBand: use --privileged for RDMA device access
    SECURITY_OPTION="--privileged"
  else
    # Non-IB: grant SYS_ADMIN and disable seccomp instead of full --privileged
    SECURITY_OPTION="--cap-add=SYS_ADMIN --security-opt seccomp=unconfined"
  fi

  # Both variants share everything except the security flags chosen above.
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "sudo docker run --rm -itd ${SECURITY_OPTION} --net=host --ipc=host ${LAUNCH_OPTION} \
    -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=${CONTAINER_NAME} \
    --entrypoint /bin/bash ${CONTAINERIMAGE}"
fi

###############################################################################
# 9b. Print GPU/driver info from host and container (CUDA only)
###############################################################################
if [[ "${PLATFORM}" == "cuda" ]]; then
  # Diagnostics only: each probe has an `|| echo` fallback.
  echo "=== nvidia-smi on host ==="
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "nvidia-smi || echo 'nvidia-smi not available on host'; \
    echo '--- /proc/driver/nvidia/version ---'; \
    cat /proc/driver/nvidia/version 2>/dev/null || echo 'nvidia driver version file missing'"

  echo "=== nvidia-smi inside container (${CONTAINER_NAME}) ==="
  parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
    "sudo docker exec -t --user root ${CONTAINER_NAME} bash -lc \
    'nvidia-smi || echo \"nvidia-smi failed in container\"; \
    echo \"--- nvcc --version ---\"; \
    nvcc --version || echo \"nvcc not found\"'"
fi

###############################################################################
# 10. Run setup script inside the container
###############################################################################
# Run setup.sh as root in the container launched above, passing the platform.
parallel-ssh -i -t 0 -h "${HOSTFILE}" -x "-i ${KeyFilePath}" -O "${SSH_OPTION}" \
  "sudo docker exec -t --user root ${CONTAINER_NAME} bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"
|