Fix multi-node H100 CI: CUDA compat, deploy improvements (#781)

## Summary

- **Multi-node H100 CI setup**: Improve architecture detection and GPU
configuration
- **Remove hardcoded VMSS hostnames** from deploy files
- **Fix CUDA compat library issue**: Remove stale compat paths from
Docker image for CUDA 12+. Instead, `peer_access_test` now returns a
distinct exit code (2) for CUDA init failure, and `setup.sh`
conditionally adds compat libs only when needed. This fixes
`cudaErrorSystemNotReady` (error 803) when the host driver is newer than
the container's compat libs.
- **Speed up deploy**: Replace recursive `parallel-scp` with
tar+scp+untar to avoid per-file SSH overhead.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-04-13 21:51:29 -07:00
committed by GitHub
parent b6d0ca13ca
commit ecd33722d4
12 changed files with 200 additions and 88 deletions

View File

@@ -1,8 +0,0 @@
Host mscclit-000000
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
Host mscclit-000001
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no

View File

@@ -33,12 +33,34 @@ done
# Abort the deploy as soon as any of the following steps fails.
set -e
# Remove any previous copy of the destination directory on every host.
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
# Recursive copy of the source tree to every host.
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
# Single-archive transfer: pack ROOT_DIR into one tarball locally, copy that
# tarball to every host, unpack it into DST_DIR there, then delete the remote
# and the local tarball.
tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
rm -f /tmp/mscclpp.tar.gz
# ROCm hosts: load the amdgpu kernel module.
if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi
# Install GDRCopy kernel module on host VMs (CUDA only)
# The remote command is a no-op when lsmod already lists gdrdrv. Otherwise it
# installs build dependencies, downloads the tagged GDRCopy source archive,
# builds the .deb packages, installs gdrdrv-dkms and loads the module.
# ${GDRCOPY_VERSION} sits inside a double-quoted string, so it is expanded by
# this (local) shell before the command text is sent to the hosts.
GDRCOPY_VERSION="2.5.2"
if [ "${PLATFORM}" == "cuda" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"if lsmod | grep -q gdrdrv; then
echo 'gdrdrv module already loaded'
else
set -e
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
CUDA=/usr/local/cuda ./build-deb-packages.sh
sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
sudo modprobe gdrdrv
rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
fi"
fi
# Force a pull of the latest container image on every host.
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"

View File

@@ -1,2 +0,0 @@
azureuser@mscclit-000000
azureuser@mscclit-000001

View File

@@ -1,2 +0,0 @@
mscclit-000000
mscclit-000001

View File

@@ -1,3 +1,10 @@
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"}
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"}
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"}

View File

@@ -1,83 +1,99 @@
# Fail fast: abort on the first command that returns a non-zero status.
set -e
# Hostfile handed to mpirun; its first line is used as the head host.
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
HEAD_HOST=$(head -1 "${HOSTFILE}")
# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface.
# The lookup is best-effort: without the trailing "|| HEAD_IP=" a failing ssh
# would trip errexit on the assignment itself and the hostname fallback below
# could never run.
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey "${HEAD_HOST}" "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) || HEAD_IP=""
if [ -z "${HEAD_IP}" ]; then
  # No address obtained: fall back to the name taken from the hostfile.
  HEAD_IP=${HEAD_HOST}
fi
# mpirun options shared by every launch in this script.
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
# Environment forwarded to the ranks with -x. The backslash keeps
# $LD_LIBRARY_PATH literal in this string instead of expanding it here.
# NOTE(review): confirm that the literal reference is expanded as intended on
# the launch side.
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"
# Select perf baseline based on GPU type: a GPU 0 name containing "H100"
# (case-insensitive) picks perf_ndmv5.jsonl, anything else perf_ndmv4.jsonl.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
if echo "${GPU_NAME}" | grep -qi "H100"; then
  PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
else
  PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
fi
# Run the mscclpp-test perf binaries (allgather, allreduce, alltoall) under
# mpirun with 16 ranks, 8 per node, each invoked with
# -o /root/mscclpp/output.jsonl, then run check_perf_result.py on that file
# against a baseline file.
# Globals: HOSTFILE, MPI_ARGS, MSCCLPP_ENV, PERF_BASELINE (all read)
function run_mscclpp_test()
{
# allgather: kernels 0, 2 and 3.
echo "=================Run allgather_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
# For kernel 2, the message size must be divisible by 3 (hence -b 3K -e 3G)
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
# allreduce: kernels 0 through 4.
echo "==================Run allreduce_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
# alltoall: kernel 0.
echo "==================Run alltoall_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
# Compare the collected results with the baseline file.
echo "========================Run performance check==============================="
python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
--baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl
--baseline-file ${PERF_BASELINE}
}
# Run the mp_unit_tests binary under mpirun in two configurations:
# 2 ranks with 1 rank per node, then 16 ranks with 8 ranks per node.
# Each launch passes -ip_port with port 20003.
# Globals: HOSTFILE, MPI_ARGS, MSCCLPP_ENV, HEAD_IP (all read)
function run_mp_ut()
{
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
mpirun ${MPI_ARGS} -tag-output -np 2 \
${MSCCLPP_ENV} \
-npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
mpirun ${MPI_ARGS} -tag-output -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
}
# Launch /root/mscclpp/test/deploy/pytest.sh through bash under mpirun with
# 16 ranks, 8 per node, exporting MSCCLPP_HOME=/root/mscclpp to the ranks.
# Globals: HOSTFILE, MPI_ARGS, MSCCLPP_ENV (all read)
function run_pytests()
{
echo "==================Run python tests================================"
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
mpirun ${MPI_ARGS} -tag-output -np 16 \
${MSCCLPP_ENV} \
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
}
# Launch python/mscclpp_benchmark/allreduce_bench.py under mpirun with
# 16 ranks, 8 per node. Besides the shared options it selects the ob1 PML,
# excludes the openib BTL and exports a set of NCCL_* variables,
# CUDA_DEVICE_ORDER and MSCCLPP_HOME to the ranks.
# NOTE(review): NCCL_TOPO_FILE always points at /opt/microsoft/ndv4-topo.xml;
# confirm this is intended for every VM type this script runs on.
# Globals: HOSTFILE, MPI_ARGS, MSCCLPP_ENV (all read)
function run_py_benchmark()
{
echo "==================Run python benchmark================================"
mpirun -allow-run-as-root -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
-x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
-x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
-x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
}
if [ $# -lt 1 ]; then

View File

@@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}"
# Install the SSH credentials that ship with the checkout under /root/mscclpp:
# the public key becomes root's authorized_keys, the deploy "config" file is
# moved to /root/.ssh/config, and the private key is restricted to mode 400
# and owned by root.
mkdir -p /root/.ssh
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
chown root:root /root/.ssh/authorized_keys
mv /root/mscclpp/test/deploy/config /root/.ssh/config
chown root:root /root/.ssh/config
chmod 400 /root/mscclpp/sshkey
chown root:root /root/mscclpp/sshkey
# Generate SSH config from hostfile_mpi
#######################################
# Write one ssh_config "Host" stanza (port 22345, the mscclpp identity file,
# host-key checking disabled) per host listed in a hostfile.
# Arguments: $1 - hostfile with one host per line
#            $2 - ssh config file to create or overwrite
# Notes:     blank lines are skipped so no "Host" stanza without a pattern is
#            written, and a last line lacking a trailing newline is still
#            processed.
#######################################
write_ssh_config() {
  local hostfile=$1
  local config=$2
  local host
  # "|| [ -n ... ]" picks up a final line that has no terminating newline.
  while IFS= read -r host || [ -n "${host}" ]; do
    [ -n "${host}" ] || continue
    printf 'Host %s\n Port 22345\n IdentityFile /root/mscclpp/sshkey\n StrictHostKeyChecking no\n' "${host}"
  done < "${hostfile}" > "${config}"
}

HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi
if [ -f "${HOSTFILE_MPI}" ]; then
  write_ssh_config "${HOSTFILE_MPI}" /root/.ssh/config
  chown root:root /root/.ssh/config
fi
if [ "${PLATFORM}" == "cuda" ]; then
nvidia-smi -pm 1
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
@@ -18,7 +29,25 @@ if [ "${PLATFORM}" == "cuda" ]; then
fi
# Build the peer-access probe, run it once, and react to its exit status:
#   0      -> carry on
#   2      -> on CUDA only: CUDA init failure (e.g., driver/toolkit version
#             mismatch); prepend the CUDA compat libs to LD_LIBRARY_PATH and
#             run the probe a second time
#   other  -> exit with that status
make -C /root/mscclpp/tools/peer-access-test
# Suspend errexit just long enough to capture the probe's status.
set +e
/root/mscclpp/tools/peer-access-test/peer_access_test
PEER_ACCESS_EXIT_CODE=$?
set -e
if [ "${PEER_ACCESS_EXIT_CODE}" -ne 0 ]; then
  # Anything except "exit code 2 on CUDA" is passed straight through.
  if [ "${PEER_ACCESS_EXIT_CODE}" -ne 2 ] || [ "${PLATFORM}" != "cuda" ]; then
    exit "${PEER_ACCESS_EXIT_CODE}"
  fi
  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
  if [ ! -d "${CUDA_COMPAT_PATH}" ]; then
    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
    exit 1
  fi
  echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
  export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}:${LD_LIBRARY_PATH}"
  # errexit is active again, so a second failure ends the script here.
  /root/mscclpp/tools/peer-access-test/peer_access_test
fi
make -C /root/mscclpp/tools/peer-access-test clean
if [[ "${CUDA_VERSION}" == *"11."* ]]; then