Files
mscclpp/test/deploy/deploy.sh
Binyang Li 572028ea3d Fix nccl-test CI building for all GPU architectures (#786)
## Problem

`nccl-test.yml` was the only CI template calling `deploy.yml` without
passing `gpuArch`. Since the CI build machine has no GPU, CMake fell
back to building for **all** supported architectures (`80;90;100;120`),
unnecessarily slowing down CI builds.

## Fix

- Add `gpuArch` parameter to `nccl-test.yml` and forward it to
`deploy.yml`
- Pass `gpuArch: '80'` (A100) and `gpuArch: '90'` (H100) from
`nccl-api-test.yml`

All other templates were already passing `gpuArch` correctly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-15 12:55:40 -07:00

82 lines
3.3 KiB
Bash

set -ex
TEST_NAME=$1
IB_ENVIRONMENT="${2:-true}"
PLATFORM="${3:-cuda}"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
DST_DIR="/tmp/mscclpp"
if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
else
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
fi
SSH_OPTION="StrictHostKeyChecking=no"
chmod 400 ${KeyFilePath}
ssh-keygen -t rsa -f sshkey -P ""
while true; do
set +e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "hostname"
if [ $? -eq 0 ]; then
break
fi
echo "Waiting for sshd to start..."
sleep 5
done
set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
rm -f /tmp/mscclpp.tar.gz
if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi
# Install GDRCopy kernel module on host VMs (CUDA only)
GDRCOPY_VERSION="2.5.2"
if [ "${PLATFORM}" == "cuda" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"if lsmod | grep -q gdrdrv; then
echo 'gdrdrv module already loaded'
else
set -e
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
CUDA=/usr/local/cuda ./build-deb-packages.sh
sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
sudo modprobe gdrdrv
rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
fi"
fi
# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
LAUNCH_OPTION="--gpus=all"
if [ "${PLATFORM}" == "rocm" ]; then
LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video"
fi
if [ "${IB_ENVIRONMENT}" == "true" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
else
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
fi
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}"