mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
88 lines
2.8 KiB
Bash
88 lines
2.8 KiB
Bash
# Exit as soon as a command fails (errexit).
set -o errexit

# Target platform: first positional argument, defaulting to "cuda".
PLATFORM=${1:-cuda}

# Python virtual environment.
# A new venv is created under /root/venv only when no venv is currently
# active AND /root/venv has no activate script yet. Whenever that activate
# script exists it is sourced, whether or not another venv was already active.
venv_root=/root/venv
venv_activate="${venv_root}/bin/activate"

[[ -n "${VIRTUAL_ENV}" || -f "${venv_activate}" ]] || python3 -m venv "${venv_root}"

if [[ -f "${venv_activate}" ]]; then
  source "${venv_activate}"
fi
# Install the SSH key material shipped in the checkout:
#   - sshkey.pub is moved into place as root's authorized_keys
#   - sshkey stays in the checkout, readable by its owner only (mode 400)
# Both files end up owned by root:root.
root_ssh_dir=/root/.ssh
checkout_dir=/root/mscclpp

mkdir -p "${root_ssh_dir}"
mv "${checkout_dir}/sshkey.pub" "${root_ssh_dir}/authorized_keys"
chown root:root "${root_ssh_dir}/authorized_keys"

chmod 400 "${checkout_dir}/sshkey"
chown root:root "${checkout_dir}/sshkey"
# Generate SSH config from hostfile_mpi

#######################################
# Print one ssh_config "Host" stanza for every host listed in a hostfile.
# Each stanza selects port 22345, the /root/mscclpp/sshkey identity file and
# disables strict host key checking.
# Arguments: $1 - path to a file containing one host per line
# Outputs:   the stanzas, on stdout
# Returns:   0 on success; non-zero if the hostfile cannot be opened
#######################################
print_ssh_host_stanzas() {
  local hostfile=$1
  local host
  # `|| [[ -n ... ]]` keeps a final line that has no trailing newline;
  # plain `read` would report EOF and drop that host.
  while IFS= read -r host || [[ -n "${host}" ]]; do
    # An empty line would produce a "Host" keyword with no pattern.
    [[ -n "${host}" ]] || continue
    printf 'Host %s\n Port 22345\n IdentityFile /root/mscclpp/sshkey\n StrictHostKeyChecking no\n' \
      "${host}"
  done < "${hostfile}"
}

HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi
if [ -f "${HOSTFILE_MPI}" ]; then
  # A single redirect truncates any previous config and writes the new one.
  print_ssh_host_stanzas "${HOSTFILE_MPI}" > /root/.ssh/config
  chown root:root /root/.ssh/config
fi
#######################################
# Enable persistence mode, then set every GPU's application clocks to the
# maximum memory / SM clocks that the GPU reports.
# Arguments: none
# Returns:   0 on success; the failing nvidia-smi status otherwise
#######################################
set_max_gpu_clocks() {
  local gpu_indices gpu clocks

  nvidia-smi -pm 1 || return

  # Query the GPU indices directly rather than counting `nvidia-smi -L`
  # lines: any extra line in the -L listing (NOTE(review): e.g. MIG device
  # entries - confirm on a MIG host) would make a line count overshoot.
  gpu_indices=$(nvidia-smi --query-gpu=index --format=csv,noheader) || return

  # Intentionally unquoted: split the newline-separated index list.
  for gpu in ${gpu_indices}; do
    clocks=$(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm \
      --format=csv,noheader,nounits -i "${gpu}") || return
    # The query prints "<mem>, <sm>"; strip the blank to get "<mem>,<sm>".
    nvidia-smi -ac "${clocks// /}" -i "${gpu}" || return
  done
}

if [ "${PLATFORM}" == "cuda" ]; then
  set_max_gpu_clocks
fi
# Build the peer-access probe, run it, and remove the build products again.
# A non-zero probe status ends the script with that status, except for
# status 2 on the cuda platform, which triggers one retry with the CUDA
# compat libraries on the library search path.
PEER_ACCESS_TEST_DIR=/root/mscclpp/tools/peer-access-test
make -C "${PEER_ACCESS_TEST_DIR}"

# Capture the probe's status without letting errexit end the script here.
PEER_ACCESS_EXIT_CODE=0
"${PEER_ACCESS_TEST_DIR}/peer_access_test" || PEER_ACCESS_EXIT_CODE=$?

if [[ "${PEER_ACCESS_EXIT_CODE}" -eq 2 && "${PLATFORM}" == "cuda" ]]; then
  # Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch).
  # Add CUDA compat libs for forward compatibility and retry.
  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
  if [[ -d "${CUDA_COMPAT_PATH}" ]]; then
    echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
    # Append the previous value only when it is non-empty: a trailing ":"
    # would add an empty entry, which the dynamic loader resolves to the
    # current working directory.
    export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    # A failing retry is fatal; propagate its status explicitly.
    "${PEER_ACCESS_TEST_DIR}/peer_access_test" || exit
  else
    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
    exit 1
  fi
elif [[ "${PEER_ACCESS_EXIT_CODE}" -ne 0 ]]; then
  exit "${PEER_ACCESS_EXIT_CODE}"
fi
make -C "${PEER_ACCESS_TEST_DIR}" clean
# Build environment for the pip install that follows.

# On ROCm, compile with hipcc.
if [[ "${PLATFORM}" == "rocm" ]]; then
  export CXX=/opt/rocm/bin/hipcc
fi

#######################################
# Export CMAKE_ARGS with the contents of an arguments file, if it exists.
# Globals:   CMAKE_ARGS (written, exported)
# Arguments: $1 - path to the file holding the CMake arguments
# Outputs:   "Using CMAKE_ARGS: ..." on stdout when the file was loaded
# Returns:   0 when the file is absent or was loaded; non-zero if it exists
#            but cannot be read
#######################################
load_pip_cmake_args() {
  local args_file=$1
  [[ -f "${args_file}" ]] || return 0
  # Assign first, export second: `export VAR="$(cmd)"` reports the status of
  # `export`, so a failed read would go unnoticed.
  CMAKE_ARGS=$(< "${args_file}") || return
  export CMAKE_ARGS
  echo "Using CMAKE_ARGS: ${CMAKE_ARGS}"
}

PIP_CMAKE_ARGS_FILE="/root/mscclpp/pip_cmake_args.txt"
load_pip_cmake_args "${PIP_CMAKE_ARGS_FILE}"
# Install the package from the checkout with the extras for this environment.
cd /root/mscclpp

# Extras selection: the first matching CUDA_VERSION pattern wins; when none
# matches, PLATFORM decides between the ROCm extras and the plain set.
case "${CUDA_VERSION}" in
  *11.*)
    pip_extras="cuda11,benchmark,test"
    ;;
  *12.*)
    pip_extras="cuda12,benchmark,test"
    ;;
  *13.*)
    pip_extras="cuda13,benchmark,test"
    ;;
  *)
    if [[ "${PLATFORM}" == "rocm" ]]; then
      pip_extras="rocm6,benchmark,test"
    else
      pip_extras="benchmark,test"
    fi
    ;;
esac
pip3 install ".[${pip_extras}]"

# Install setuptools_scm and have it (re)write the version files.
pip3 install setuptools_scm
python3 -m setuptools_scm --force-write-version-files
# Start the SSH daemon on port 22345, the same port written into the
# generated per-host SSH client config.
# /var/run/sshd is created first (presumably the runtime directory sshd
# expects to exist - confirm against the image's sshd build).
sshd_runtime_dir=/var/run/sshd
sshd_port=22345

mkdir -p "${sshd_runtime_dir}"
/usr/sbin/sshd -p "${sshd_port}"