#!/bin/bash

#  Apache 2.0 License
#  Copyright 2024-2025 NVIDIA Corporation
#
#  Licensed under the Apache License, Version 2.0 with the LLVM exception
#  (the "License"); you may not use this file except in compliance with
#  the License.
#
#  You may obtain a copy of the License at
#
#      http://llvm.org/foundation/relicensing/LICENSE.txt
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# Build a multi-CUDA wheel for the given Python version
# This builds separate wheels for each supported CUDA major version,
# and then merges them into a single wheel containing extensions
# for all CUDA versions. At runtime, depending on the installed CUDA version,
# the correct extension will be chosen.

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Absolute path of the directory containing this script.
ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
usage="Usage: $0 -py-version [additional options...]"

# parse_python_args and require_py_version are expected to come from this
# helper; later steps of this script read ${py_version}.
source "$ci_dir/util/python/common_arg_parser.sh"
parse_python_args "$@"

# Check if py_version was provided (this script requires it)
require_py_version "$usage" || exit 1

# Diagnostic only: a failing ls inside the command substitution does not abort
# the script, because the exit status of this command is echo's.
echo "Docker socket: " "$(ls /var/run/docker.sock)"

# Set HOST_WORKSPACE if not already set (for local runs)
if [[ -z "${HOST_WORKSPACE:-}" ]]; then
  # Get the repository root
  HOST_WORKSPACE="$(cd "${ci_dir}/.." && pwd)"
  echo "Setting HOST_WORKSPACE to: $HOST_WORKSPACE"
fi

# cuda-bench must be built in a container that can produce manylinux wheels,
# and has the CUDA toolkit installed. We use the rapidsai/ci-wheel image for this.
# We build separate wheels using separate containers for each CUDA version,
# then merge them into a single wheel.
# Toolchain versions that select the rapidsai/ci-wheel image tags.
readonly cuda12_version=12.9.1
readonly cuda13_version=13.0.1
readonly devcontainer_version=25.12
readonly devcontainer_distro=rockylinux8

# aarch64 hosts use the image tag carrying the "-arm64" suffix.
if [[ "$(uname -m)" == "aarch64" ]]; then
  readonly host_arch_suffix="-arm64"
else
  readonly host_arch_suffix=""
fi

readonly cuda12_image="rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda12_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix}"
readonly cuda13_image="rapidsai/ci-wheel:${devcontainer_version}-cuda${cuda13_version}-${devcontainer_distro}-py${py_version}${host_arch_suffix}"

mkdir -p wheelhouse

for ctk in 12 13; do
  # Look up cuda12_image / cuda13_image by name via indirect expansion
  # instead of eval.
  image_var="cuda${ctk}_image"
  image="${!image_var}"
  echo "::group::⚒️ Building CUDA ${ctk} wheel on ${image}"
  (
    # Trace the docker commands; the subshell keeps the tracing local to them.
    set -x
    docker pull "$image"
    docker run --rm -i \
      --workdir /workspace/python \
      --mount "type=bind,source=${HOST_WORKSPACE},target=/workspace/" \
      --env "py_version=${py_version}" \
      "$image" \
      /workspace/ci/build_cuda_bench_wheel_for_cuda.sh
    # Prevent GHA runners from exhausting available storage with leftover images:
    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
      docker rmi -f "$image"
    fi
  )
  echo "::endgroup::"
done

echo "Merging CUDA wheels..."

# Detect python command
if command -v python &> /dev/null; then
  PYTHON=python
elif command -v python3 &> /dev/null; then
  PYTHON=python3
else
  echo "Error: No python found"
  exit 1
fi

# Needed for unpacking and repacking wheels.
"$PYTHON" -m pip install --break-system-packages wheel

# Find the built wheels (temporarily suffixed with .cu12/.cu13 to avoid collision)
cu12_wheel=$(find wheelhouse -name "*cu12*.whl" | head -n 1)
cu13_wheel=$(find wheelhouse -name "*cu13*.whl" | head -n 1)

if [[ -z "$cu12_wheel" ]]; then
  echo "Error: CUDA 12 wheel not found in wheelhouse/"
  ls -la wheelhouse/
  exit 1
fi

if [[ -z "$cu13_wheel" ]]; then
  echo "Error: CUDA 13 wheel not found in wheelhouse/"
  ls -la wheelhouse/
  exit 1
fi

# Both patterns can match the same file if its name contains both "cu12" and
# "cu13"; treat that as a missing wheel.
if [[ "$cu12_wheel" == "$cu13_wheel" ]]; then
  echo "Error: Only one wheel found, expected two (CUDA 12 and CUDA 13)"
  ls -la wheelhouse/
  exit 1
fi

echo "Found CUDA 12 wheel: $cu12_wheel"
echo "Found CUDA 13 wheel: $cu13_wheel"

# Convert to absolute paths before changing directory
cu12_wheel=$(readlink -f "$cu12_wheel")
cu13_wheel=$(readlink -f "$cu13_wheel")

# Merge the wheels manually.
# wheelhouse_merged is scratch space: start from an empty directory so that
# leftovers from an earlier, aborted run (a stale cu13_tmp/ or old wheels)
# cannot break or pollute this merge.
rm -rf wheelhouse_merged
mkdir -p wheelhouse_merged
cd wheelhouse_merged

# Unpack CUDA 12 wheel (this will be our base)
"$PYTHON" -m wheel unpack "$cu12_wheel"
base_dir=$(find . -maxdepth 1 -type d -name "cuda_bench-*" | head -n 1)
# An empty base_dir would turn the paths below into ones rooted at "/".
if [[ -z "$base_dir" ]]; then
  echo "Error: unpacked CUDA 12 wheel directory (cuda_bench-*) not found"
  ls -la .
  exit 1
fi

# Unpack CUDA 13 wheel into a temporary subdirectory
mkdir cu13_tmp
cd cu13_tmp
"$PYTHON" -m wheel unpack "$cu13_wheel"
cu13_dir=$(find . -maxdepth 1 -type d -name "cuda_bench-*" | head -n 1)
if [[ -z "$cu13_dir" ]]; then
  echo "Error: unpacked CUDA 13 wheel directory (cuda_bench-*) not found"
  ls -la .
  exit 1
fi

# Copy the cu13/ directory from CUDA 13 wheel into the base wheel
cp -r "$cu13_dir"/cuda/bench/cu13 "../$base_dir/cuda/bench/"

# Go back and clean up
cd ..
rm -rf cu13_tmp

# Remove RECORD file to let wheel recreate it
rm -f "$base_dir"/*.dist-info/RECORD

# Repack the merged wheel
"$PYTHON" -m wheel pack "$base_dir"

# Return to the directory the merge started from.
cd ..
# Install auditwheel and repair the merged wheel
"$PYTHON" -m pip install --break-system-packages auditwheel

# wheelhouse_final is scratch output for the repair step: clear it first so a
# wheel left behind by an earlier, aborted run is not mistaken for this run's.
rm -rf wheelhouse_final

for wheel in wheelhouse_merged/cuda_bench-*.whl; do
  echo "Repairing merged wheel: $wheel"
  # The --exclude'd libraries are not bundled into the repaired wheel.
  # NOTE(review): presumably they are provided by the NVIDIA driver / CUDA
  # toolkit on the target machine - confirm.
  "$PYTHON" -m auditwheel repair \
    --exclude 'libcuda.so.1' \
    --exclude 'libnvidia-ml.so.1' \
    --exclude 'libcupti.so.12' \
    --exclude 'libcupti.so.13' \
    --exclude 'libnvperf_host.so' \
    --exclude 'libnvperf_target.so' \
    "$wheel" \
    --wheel-dir wheelhouse_final
done

# Clean up intermediate files and move only the final merged wheel to wheelhouse
rm -rf wheelhouse/*  # Clean existing wheelhouse
mkdir -p wheelhouse

# Move only the final repaired merged wheel.
# compgen -G succeeds only if the glob matches at least one path, which avoids
# parsing ls output to test for existence.
if compgen -G 'wheelhouse_final/cuda_bench-*.whl' > /dev/null; then
  mv wheelhouse_final/cuda_bench-*.whl wheelhouse/
  echo "Final merged wheel moved to wheelhouse"
else
  echo "No final repaired wheel found, moving unrepaired merged wheel"
  mv wheelhouse_merged/cuda_bench-*.whl wheelhouse/
fi

# Clean up temporary directories
rm -rf wheelhouse_merged wheelhouse_final

echo "Final wheels in wheelhouse:"
ls -la wheelhouse/