mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Fix multi-node H100 CI: CUDA compat, deploy improvements (#781)
## Summary

- **Multi-node H100 CI setup**: Improve architecture detection and GPU configuration.
- **Remove hardcoded VMSS hostnames** from deploy files.
- **Fix CUDA compat library issue**: Remove stale compat paths from the Docker image for CUDA 12+. Instead, `peer_access_test` now returns a distinct exit code (2) for CUDA init failure, and `setup.sh` conditionally adds compat libs only when needed. This fixes `cudaErrorSystemNotReady` (error 803) when the host driver is newer than the container's compat libs.
- **Speed up deploy**: Replace recursive `parallel-scp` with tar+scp+untar to avoid per-file SSH overhead.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -1,8 +0,0 @@
|
||||
Host mscclit-000000
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
Host mscclit-000001
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
@@ -33,12 +33,34 @@ done
|
||||
|
||||
set -e
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
|
||||
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
|
||||
tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
|
||||
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
|
||||
rm -f /tmp/mscclpp.tar.gz
|
||||
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
|
||||
fi
|
||||
|
||||
# Install GDRCopy kernel module on host VMs (CUDA only)
|
||||
GDRCOPY_VERSION="2.5.2"
|
||||
if [ "${PLATFORM}" == "cuda" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"if lsmod | grep -q gdrdrv; then
|
||||
echo 'gdrdrv module already loaded'
|
||||
else
|
||||
set -e
|
||||
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
|
||||
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
|
||||
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
|
||||
CUDA=/usr/local/cuda ./build-deb-packages.sh
|
||||
sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
|
||||
sudo modprobe gdrdrv
|
||||
rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
|
||||
fi"
|
||||
fi
|
||||
|
||||
# Force a pull of the latest image
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker pull ${CONTAINERIMAGE}"
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
azureuser@mscclit-000000
|
||||
azureuser@mscclit-000001
|
||||
@@ -1,2 +0,0 @@
|
||||
mscclit-000000
|
||||
mscclit-000001
|
||||
@@ -1,3 +1,10 @@
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
|
||||
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"}
|
||||
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"}
|
||||
@@ -1,83 +1,99 @@
|
||||
set -e
|
||||
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
|
||||
HEAD_HOST=$(head -1 ${HOSTFILE})
|
||||
# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface
|
||||
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null)
|
||||
if [ -z "${HEAD_IP}" ]; then
|
||||
HEAD_IP=${HEAD_HOST}
|
||||
fi
|
||||
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
|
||||
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"
|
||||
|
||||
# Select perf baseline based on GPU type
|
||||
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
|
||||
if echo "${GPU_NAME}" | grep -qi "H100"; then
|
||||
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
|
||||
else
|
||||
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
|
||||
fi
|
||||
|
||||
function run_mscclpp_test()
|
||||
{
|
||||
echo "=================Run allgather_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
# For kernel 2, the message size must be divisible by 3
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "==================Run allreduce_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "==================Run alltoall_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "========================Run performance check==============================="
|
||||
python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
|
||||
--baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl
|
||||
--baseline-file ${PERF_BASELINE}
|
||||
}
|
||||
|
||||
function run_mp_ut()
|
||||
{
|
||||
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
|
||||
mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
|
||||
mpirun ${MPI_ARGS} -tag-output -np 2 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
|
||||
|
||||
echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
|
||||
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
|
||||
mpirun ${MPI_ARGS} -tag-output -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
|
||||
}
|
||||
|
||||
function run_pytests()
|
||||
{
|
||||
echo "==================Run python tests================================"
|
||||
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
mpirun ${MPI_ARGS} -tag-output -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
|
||||
}
|
||||
|
||||
function run_py_benchmark()
|
||||
{
|
||||
echo "==================Run python benchmark================================"
|
||||
mpirun -allow-run-as-root -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
|
||||
-x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
|
||||
-x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
|
||||
-x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
|
||||
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
|
||||
}
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
|
||||
@@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}"
|
||||
mkdir -p /root/.ssh
|
||||
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
|
||||
chown root:root /root/.ssh/authorized_keys
|
||||
mv /root/mscclpp/test/deploy/config /root/.ssh/config
|
||||
chown root:root /root/.ssh/config
|
||||
chmod 400 /root/mscclpp/sshkey
|
||||
chown root:root /root/mscclpp/sshkey
|
||||
|
||||
# Generate SSH config from hostfile_mpi
|
||||
HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi
|
||||
if [ -f "${HOSTFILE_MPI}" ]; then
|
||||
> /root/.ssh/config
|
||||
while IFS= read -r host; do
|
||||
echo "Host ${host}" >> /root/.ssh/config
|
||||
echo " Port 22345" >> /root/.ssh/config
|
||||
echo " IdentityFile /root/mscclpp/sshkey" >> /root/.ssh/config
|
||||
echo " StrictHostKeyChecking no" >> /root/.ssh/config
|
||||
done < "${HOSTFILE_MPI}"
|
||||
chown root:root /root/.ssh/config
|
||||
fi
|
||||
|
||||
if [ "${PLATFORM}" == "cuda" ]; then
|
||||
nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
@@ -18,7 +29,25 @@ if [ "${PLATFORM}" == "cuda" ]; then
|
||||
fi
|
||||
|
||||
make -C /root/mscclpp/tools/peer-access-test
|
||||
set +e
|
||||
/root/mscclpp/tools/peer-access-test/peer_access_test
|
||||
PEER_ACCESS_EXIT_CODE=$?
|
||||
set -e
|
||||
if [ ${PEER_ACCESS_EXIT_CODE} -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then
|
||||
# Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch).
|
||||
# Add CUDA compat libs for forward compatibility and retry.
|
||||
CUDA_COMPAT_PATH="/usr/local/cuda/compat"
|
||||
if [ -d "${CUDA_COMPAT_PATH}" ]; then
|
||||
echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
|
||||
export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}:${LD_LIBRARY_PATH}"
|
||||
/root/mscclpp/tools/peer-access-test/peer_access_test
|
||||
else
|
||||
echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
|
||||
exit 1
|
||||
fi
|
||||
elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then
|
||||
exit ${PEER_ACCESS_EXIT_CODE}
|
||||
fi
|
||||
make -C /root/mscclpp/tools/peer-access-test clean
|
||||
|
||||
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
|
||||
|
||||
Reference in New Issue
Block a user