Files
mscclpp/test/deploy/setup.sh
Binyang Li eeea00b298 Support python wheel build (#787)
## Support Python wheel build

This PR modernizes the Python packaging for MSCCL++ by defining
dependencies and optional extras in `pyproject.toml`, enabling proper
wheel builds with `pip install ".[cuda12]"`.

### Changes

**`pyproject.toml`**
- Add `dependencies` (numpy, blake3, pybind11, sortedcontainers)
- Add `optional-dependencies` for platform-specific CuPy (`cuda11`,
`cuda12`, `cuda13`, `rocm6`), `benchmark`, and `test` extras
- Bump minimum Python version from 3.8 to 3.10

**`test/deploy/setup.sh`**
- Use `pip install ".[<platform>,benchmark,test]"` instead of separate
`pip install -r requirements_*.txt` + `pip install .` steps
- Add missing CUDA 13 case

**`docs/quickstart.md`**
- Update install instructions to use extras (e.g., `pip install
".[cuda12]"`)
- Document all available extras and clarify that `rocm6` builds CuPy
from source
- Update Python version references to 3.10

**`python/csrc/CMakeLists.txt`**, **`python/test/CMakeLists.txt`**
- Update `find_package(Python)` from 3.8 to 3.10

### Notes
- The `requirements_*.txt` files are kept for Docker base image builds
where only dependencies (not the project itself) should be installed.
- CuPy is intentionally not in base dependencies — users must specify a
platform extra to get the correct pre-built wheel (or source build for
ROCm).

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-16 21:24:45 -07:00

80 lines
2.6 KiB
Bash

# Stop at the first command that fails.
set -o errexit

# Target platform is taken from the first argument; "cuda" when omitted.
PLATFORM="${1:-cuda}"

# Install the deployed key pair for root: the public key becomes root's
# authorized_keys, and the private key is made read-only for its owner.
SSH_DIR="/root/.ssh"
DEPLOYED_KEY="/root/mscclpp/sshkey"
mkdir -p "${SSH_DIR}"
mv "${DEPLOYED_KEY}.pub" "${SSH_DIR}/authorized_keys"
chown root:root "${SSH_DIR}/authorized_keys"
chmod 400 "${DEPLOYED_KEY}"
chown root:root "${DEPLOYED_KEY}"
# Generate SSH config from hostfile_mpi
HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi

#######################################
# Print one ssh_config "Host" stanza per host listed in a hostfile.
# Blank or whitespace-only lines and lines starting with '#' are skipped so
# they cannot produce an empty "Host" entry. A final line that has no
# trailing newline is still processed.
# Arguments: $1 - path to a file with one host per line
# Outputs:   the stanzas on stdout
#######################################
print_ssh_host_entries() {
  local hostfile=$1
  local host compact
  # `|| [[ -n ... ]]` keeps the last line when the file does not end in '\n'.
  while IFS= read -r host || [[ -n "${host}" ]]; do
    compact="${host//[[:space:]]/}"
    if [[ -z "${compact}" || "${compact}" == "#"* ]]; then
      continue
    fi
    printf 'Host %s\n' "${host}"
    printf '%s\n' \
      " Port 22345" \
      " IdentityFile /root/mscclpp/sshkey" \
      " StrictHostKeyChecking no"
  done < "${hostfile}"
}

if [ -f "${HOSTFILE_MPI}" ]; then
  # A single redirection truncates and rewrites the config in one go.
  print_ssh_host_entries "${HOSTFILE_MPI}" > /root/.ssh/config
  chown root:root /root/.ssh/config
fi
# On CUDA hosts, turn on persistence mode and set every GPU's application
# clocks to the maximum memory/SM clocks that the GPU itself reports.
if [ "${PLATFORM}" == "cuda" ]; then
  nvidia-smi -pm 1
  # `nvidia-smi -L` prints one line per GPU.
  gpu_count=$(nvidia-smi -L | wc -l)
  for ((gpu = 0; gpu < gpu_count; gpu++)); do
    # The query prints "<mem>, <sm>"; drop the space to get the "<mem>,<sm>"
    # form passed to -ac. The value is quoted so it always reaches nvidia-smi
    # as a single argument and is never glob-expanded.
    max_clocks=$(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm \
      --format=csv,noheader,nounits -i "${gpu}" | sed 's/ //')
    nvidia-smi -ac "${max_clocks}" -i "${gpu}"
  done
fi
# Build and run the peer-access test before installing anything.
make -C /root/mscclpp/tools/peer-access-test

# Capture the exit status instead of letting errexit abort the script:
# exit code 2 gets a retry below. `cmd || var=$?` does this without toggling
# `set +e` / `set -e`.
PEER_ACCESS_EXIT_CODE=0
/root/mscclpp/tools/peer-access-test/peer_access_test || PEER_ACCESS_EXIT_CODE=$?

if [ "${PEER_ACCESS_EXIT_CODE}" -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then
  # Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch).
  # Add CUDA compat libs for forward compatibility and retry.
  CUDA_COMPAT_PATH="/usr/local/cuda/compat"
  if [ -d "${CUDA_COMPAT_PATH}" ]; then
    echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
    # Append the previous value only when it is non-empty. A trailing ':'
    # would add an empty entry, which the dynamic linker treats as the
    # current working directory.
    export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    # If the retry fails too, errexit stops the script here.
    /root/mscclpp/tools/peer-access-test/peer_access_test
  else
    echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
    exit 1
  fi
elif [ "${PEER_ACCESS_EXIT_CODE}" -ne 0 ]; then
  # Any other failure is fatal; propagate the test's own exit code.
  exit "${PEER_ACCESS_EXIT_CODE}"
fi

make -C /root/mscclpp/tools/peer-access-test clean
# ROCm builds use hipcc as the C++ compiler.
if [ "${PLATFORM}" == "rocm" ]; then
  export CXX=/opt/rocm/bin/hipcc
fi

# Optional extra CMake arguments: when the file exists, its content is
# exported as CMAKE_ARGS (presumably picked up by the pip build that follows).
PIP_CMAKE_ARGS_FILE="/root/mscclpp/pip_cmake_args.txt"
if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then
  # Assign first, export second: `export VAR="$(cmd)"` returns the status of
  # `export`, so a failed read would go unnoticed.
  CMAKE_ARGS="$(< "${PIP_CMAKE_ARGS_FILE}")"
  export CMAKE_ARGS
  echo "Using CMAKE_ARGS: ${CMAKE_ARGS}"
fi
# Install the project from /root/mscclpp with the benchmark and test extras,
# plus the platform extra selected from CUDA_VERSION (or rocm6 when no CUDA
# version matches and PLATFORM is "rocm").
# NOTE(review): the CUDA patterns are substring matches checked in order, so
# a value such as "12.11.0" would select cuda11 — confirm this is acceptable.
cd /root/mscclpp
case "${CUDA_VERSION}" in
  *"11."*)
    PIP_TARGET=".[cuda11,benchmark,test]"
    ;;
  *"12."*)
    PIP_TARGET=".[cuda12,benchmark,test]"
    ;;
  *"13."*)
    PIP_TARGET=".[cuda13,benchmark,test]"
    ;;
  *)
    if [ "${PLATFORM}" == "rocm" ]; then
      PIP_TARGET=".[rocm6,benchmark,test]"
    else
      PIP_TARGET=".[benchmark,test]"
    fi
    ;;
esac
pip3 install "${PIP_TARGET}"

# Write the version files with setuptools_scm after the install.
pip3 install setuptools_scm
python3 -m setuptools_scm --force-write-version-files
# Start the SSH daemon on port 22345, creating /var/run/sshd first.
SSHD_RUN_DIR="/var/run/sshd"
SSHD_PORT=22345
mkdir -p "${SSHD_RUN_DIR}"
/usr/sbin/sshd -p "${SSHD_PORT}"