diff --git a/docs/quickstart.md b/docs/quickstart.md index c9c98128..83a08d6a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -25,9 +25,9 @@ ```bash sudo apt-get install libnuma-dev ``` - * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package + * (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package ```bash - sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)" + sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)" ``` If you don't want to build Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)). * (Optional, for benchmarks) MPI @@ -100,13 +100,30 @@ There are a few optional CMake options you can set: (install-from-source-python-module)= ## Install from Source (Python Module) -Python 3.8 or later is required. +Python 3.10 or later is required. ```bash -# For NVIDIA platforms -$ python -m pip install . -# For AMD platforms, set the C++ compiler to HIPCC -$ CXX=/opt/rocm/bin/hipcc python -m pip install . +# For NVIDIA platforms (specify your CUDA version) +$ python -m pip install ".[cuda12]" +# For AMD platforms +$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]" +``` + +> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy. +> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source, +> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy. + +Optional extras can be installed by specifying them in brackets. Available extras: +- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version. +- **`rocm6`**: Install CuPy from source for AMD ROCm platforms. +- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib). +- **`test`**: Install test dependencies (pytest, mpi4py, netifaces). + +```bash +# Example: install with CUDA 12 and benchmark extras +$ python -m pip install ".[cuda12,benchmark]" +# Example: install with all extras for testing on CUDA 12 +$ python -m pip install ".[cuda12,benchmark,test]" ``` (vscode-dev-container)= @@ -158,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0 [Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. ```bash -# Choose `requirements_*.txt` according to your CUDA/ROCm version. -$ python3 -m pip install -r ./python/requirements_cuda12.txt +# Install with benchmark dependencies and the appropriate CUDA/ROCm extras. +# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6. +$ python3 -m pip install ".[cuda12,benchmark,test]" $ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py ``` diff --git a/pyproject.toml b/pyproject.toml index 651fec3b..0ea569cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build" name = "mscclpp" dynamic = ["version"] description = "MSCCL++ Python API" -requires-python = ">=3.8" +requires-python = ">=3.10" +dependencies = [ + "numpy", + "blake3", + "pybind11", + "sortedcontainers", +] + +[project.optional-dependencies] +cuda11 = ["cupy-cuda11x"] +cuda12 = ["cupy-cuda12x"] +cuda13 = ["cupy-cuda13x"] +rocm6 = ["cupy"] +benchmark = [ + "mpi4py", + "prettytable", + "netifaces", + "matplotlib", +] +test = [ + "pytest", + "mpi4py", + "netifaces", +] [tool.setuptools_scm] write_to = "python/mscclpp/_version.py" @@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF" [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py310'] include = '\.pyi?$' diff --git a/python/csrc/CMakeLists.txt b/python/csrc/CMakeLists.txt index 44fb150f..7c7bf3b9 100644 --- a/python/csrc/CMakeLists.txt +++ b/python/csrc/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2) FetchContent_MakeAvailable(nanobind) diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index b2da976d..3b77ce8e 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -192,6 +192,9 @@ class NativeCodeCompiler: """ def __init__(self): + self._initialized = False + + def _do_init(self): self._is_hip = cp.cuda.runtime.is_hip self._device_arch = get_device_arch() self._compiler = self._get_compiler() @@ -226,6 +229,7 @@ class NativeCodeCompiler: ] self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) + self._initialized = True def _get_compiler(self) -> str: """Get the path to the appropriate compiler. @@ -246,6 +250,8 @@ class NativeCodeCompiler: Returns: str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD). """ + if not self._initialized: + self._do_init() return self._device_arch def __call__(self, name: str, file: str, **kwds): @@ -290,6 +296,8 @@ class NativeCodeCompiler: >>> # Use the module to create an algorithm >>> algo = module.create_allreduce_algorithm(comm, buffer, size) """ + if not self._initialized: + self._do_init() if not os.path.isfile(file): raise FileNotFoundError(f"The specified source file does not exist: {file}") diff --git a/python/requirements_cuda11.txt b/python/requirements_cuda11.txt index 4e2e9371..a9786071 100644 --- a/python/requirements_cuda11.txt +++ b/python/requirements_cuda11.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda12.txt b/python/requirements_cuda12.txt index e1c9b726..71572714 100644 --- a/python/requirements_cuda12.txt +++ b/python/requirements_cuda12.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index 49cf13bc..95e99533 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index 7ed4fef3..757d4e26 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -5,6 +5,6 @@ netifaces pytest numpy matplotlib -sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +sortedcontainers blake3 pybind11 \ No newline at end of file diff --git a/python/test/CMakeLists.txt b/python/test/CMakeLists.txt index be62aea9..e55711d2 100644 --- a/python/test/CMakeLists.txt +++ b/python/test/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED) include(FetchContent) FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0) FetchContent_MakeAvailable(nanobind) diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index bc29efd8..2a88a310 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -50,12 +50,6 @@ elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then fi make -C /root/mscclpp/tools/peer-access-test clean -if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda11.txt -elif [[ "${CUDA_VERSION}" == *"12."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cuda12.txt -fi - if [ "${PLATFORM}" == "rocm" ]; then export CXX=/opt/rocm/bin/hipcc fi @@ -65,7 +59,19 @@ if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})" echo "Using CMAKE_ARGS: ${CMAKE_ARGS}" fi -cd /root/mscclpp && pip3 install . + +cd /root/mscclpp +if [[ "${CUDA_VERSION}" == *"11."* ]]; then + pip3 install ".[cuda11,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"12."* ]]; then + pip3 install ".[cuda12,benchmark,test]" +elif [[ "${CUDA_VERSION}" == *"13."* ]]; then + pip3 install ".[cuda13,benchmark,test]" +elif [ "${PLATFORM}" == "rocm" ]; then + pip3 install ".[rocm6,benchmark,test]" +else + pip3 install ".[benchmark,test]" +fi pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files