Fix multi-node H100 CI: CUDA compat, deploy improvements (#781)

## Summary

- **Multi-node H100 CI setup**: Improve architecture detection and GPU
configuration
- **Remove hardcoded VMSS hostnames** from deploy files
- **Fix CUDA compat library issue**: Remove stale compat paths from
Docker image for CUDA 12+. Instead, `peer_access_test` now returns a
distinct exit code (2) for CUDA init failure, and `setup.sh`
conditionally adds compat libs only when needed. This fixes
`cudaErrorSystemNotReady` (error 803) when the host driver is newer than
the container's compat libs.
- **Speed up deploy**: Replace recursive `parallel-scp` with
tar+scp+untar to avoid per-file SSH overhead.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-04-13 21:51:29 -07:00
committed by GitHub
parent b6d0ca13ca
commit ecd33722d4
12 changed files with 200 additions and 88 deletions

View File

@@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}"
# Install sshkey.pub as root's authorized_keys.
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys
# Install the SSH client config shipped under test/deploy.
# NOTE(review): the hostfile_mpi-based generation further down truncates
# /root/.ssh/config when that hostfile exists; these two lines may be removed
# lines from the diff rendering -- confirm against the actual file.
mv /root/mscclpp/test/deploy/config /root/.ssh/config
chown root:root /root/.ssh/config
# Make the private key read-only for its owner and owned by root.
chmod 400 /root/mscclpp/sshkey
chown root:root /root/mscclpp/sshkey
# Generate SSH config from hostfile_mpi
HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi

#######################################
# Write one ssh_config "Host" stanza per entry of a hostfile.
# Blank / whitespace-only lines are skipped (they would otherwise produce an
# empty "Host" pattern), and a final line lacking a trailing newline is still
# processed.
# Arguments: $1 - hostfile to read (one host per line)
#            $2 - ssh config file to write (truncated first)
# Returns:   non-zero if the hostfile cannot be read or the config written
#######################################
generate_ssh_config() {
  local hostfile=$1
  local config=$2
  local host

  : > "${config}" || return
  # `|| [[ -n ... ]]` keeps the last line when the file has no final newline.
  while IFS= read -r host || [[ -n "${host}" ]]; do
    # Skip lines that contain nothing but whitespace.
    [[ -n "${host//[[:space:]]/}" ]] || continue
    {
      echo "Host ${host}"
      echo " Port 22345"
      echo " IdentityFile /root/mscclpp/sshkey"
      echo " StrictHostKeyChecking no"
    } >> "${config}"
  done < "${hostfile}"
}

if [ -f "${HOSTFILE_MPI}" ]; then
  generate_ssh_config "${HOSTFILE_MPI}" /root/.ssh/config
  chown root:root /root/.ssh/config
fi
if [ "${PLATFORM}" == "cuda" ]; then
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
@@ -18,7 +29,25 @@ if [ "${PLATFORM}" == "cuda" ]; then
fi
# Build the peer-access probe, run it, and clean the build artifacts afterwards.
make -C /root/mscclpp/tools/peer-access-test
# Disable errexit around the probe so its exit status can be inspected instead
# of aborting the script immediately.
set +e
/root/mscclpp/tools/peer-access-test/peer_access_test
PEER_ACCESS_EXIT_CODE=$?
set -e
if [ "${PEER_ACCESS_EXIT_CODE}" -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then
  # Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch).
  # Add CUDA compat libs for forward compatibility and retry.
  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
  if [ -d "${CUDA_COMPAT_PATH}" ]; then
    echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
    # Append the previous value only when it is non-empty: a trailing ':' would
    # add an empty entry, which the dynamic loader treats as the current
    # directory. This form is also safe when LD_LIBRARY_PATH is unset.
    export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    # errexit is active again here, so a failing retry aborts the script.
    /root/mscclpp/tools/peer-access-test/peer_access_test
  else
    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
    exit 1
  fi
elif [ "${PEER_ACCESS_EXIT_CODE}" -ne 0 ]; then
  # Any other failure (or exit code 2 on a non-CUDA platform) is propagated.
  exit "${PEER_ACCESS_EXIT_CODE}"
fi
make -C /root/mscclpp/tools/peer-access-test clean
if [[ "${CUDA_VERSION}" == *"11."* ]]; then