From 559a3ad4ac093d014793d65bea6f9a4eb113f0b9 Mon Sep 17 00:00:00 2001 From: Jianwei Dong Date: Mon, 29 Dec 2025 11:19:43 +0800 Subject: [PATCH] fix pypi cuda install (#1763) --- .github/workflows/release-pypi.yml | 134 +++++++++++++++++++++++++++-- kt-kernel/CMakeLists.txt | 47 +++++++++- kt-kernel/README.md | 47 +++++++++- kt-kernel/setup.py | 34 ++++++-- 4 files changed, 243 insertions(+), 19 deletions(-) diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 9375d6a..e430e1b 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -89,18 +89,42 @@ jobs: pip install auditwheel patchelf echo "Repairing wheels for manylinux compatibility..." mkdir -p wheelhouse + for wheel in dist/*.whl; do echo "Processing $wheel..." - auditwheel repair "$wheel" --plat manylinux_2_17_x86_64 -w wheelhouse/ || { - echo "Warning: auditwheel repair failed, trying to rename platform tag..." - # Fallback: rename the wheel file with manylinux tag + success=0 + + # Try different manylinux versions (newest to oldest) + for plat in manylinux_2_31_x86_64 manylinux_2_28_x86_64 manylinux_2_17_x86_64; do + echo " Trying $plat..." + if auditwheel repair "$wheel" --plat "$plat" -w wheelhouse/ 2>&1; then + echo " ✓ Successfully repaired with $plat" + success=1 + break + fi + done + + # If all auditwheel attempts failed, use rename fallback + if [ $success -eq 0 ]; then + echo " Warning: auditwheel repair failed, using rename fallback..." wheel_name=$(basename "$wheel") - new_name=$(echo "$wheel_name" | sed 's/linux_x86_64/manylinux_2_17_x86_64/') + # Use # as sed delimiter to avoid conflict with / + new_name=$(echo "$wheel_name" | sed 's#linux_x86_64#manylinux_2_17_x86_64#') cp "$wheel" "wheelhouse/$new_name" - } + echo " ✓ Renamed to $new_name" + fi done + echo "Repaired wheels:" ls -lh wheelhouse/ + + # Verify all wheels contain 3 CPU variants + echo "Verifying CPU variants in repaired wheels..." 
+ for wheel in wheelhouse/*.whl; do + echo "Checking $(basename $wheel):" + python -m zipfile -l "$wheel" | grep "\.so" | grep -E "(amx|avx512|avx2)" + done + # Replace original wheels with repaired ones rm -f dist/*.whl cp wheelhouse/*.whl dist/ @@ -112,9 +136,87 @@ jobs: path: kt-kernel/dist/*.whl retention-days: 7 + build-kt-kernel-cuda: + name: Build kt-kernel CUDA (Python ${{ matrix.python-version }}) + runs-on: [self-hosted, linux, x64, gpu] + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Verify CUDA availability + run: | + nvidia-smi || (echo "ERROR: GPU not available" && exit 1) + nvcc --version || (echo "ERROR: CUDA toolkit not found" && exit 1) + + - name: Install dependencies + run: | + apt-get update && apt-get install -y cmake libhwloc-dev pkg-config libnuma-dev + python -m pip install --upgrade pip + pip install build wheel setuptools torch --index-url https://download.pytorch.org/whl/cu118 + + - name: Build CUDA wheel + working-directory: kt-kernel + env: + CPUINFER_USE_CUDA: '1' + CPUINFER_CUDA_ARCHS: '80;86;89;90' + CPUINFER_CUDA_STATIC_RUNTIME: '1' + CPUINFER_BUILD_TYPE: 'Release' + CPUINFER_PARALLEL: '4' + CPUINFER_FORCE_REBUILD: '1' + CUDA_HOME: '/usr/local/cuda-11.8' + run: | + echo "Building CUDA wheel for SM 80, 86, 89, 90" + python -m build --wheel -v + + - name: Verify wheel + working-directory: kt-kernel + run: | + ls -lh dist/ + # Check version suffix + [[ $(ls dist/*.whl) == *"+cuda118"* ]] || (echo "ERROR: Missing +cuda118 suffix" && exit 1) + + # Install and test + pip install dist/*.whl + python -c "import kt_kernel; print(f'Version: {kt_kernel.__version__}')" + + # Verify static linking (should NOT depend on libcudart.so) + unzip -q dist/*.whl -d 
/tmp/check + ! ldd /tmp/check/kt_kernel/*.so | grep -q "libcudart.so" || (echo "ERROR: Dynamic cudart found" && exit 1) + echo "✓ CUDA runtime statically linked" + + - name: Repair wheel for manylinux + working-directory: kt-kernel + run: | + pip install auditwheel patchelf + mkdir -p wheelhouse + for wheel in dist/*.whl; do + auditwheel repair "$wheel" --plat manylinux_2_17_x86_64 --exclude libcuda.so.1 -w wheelhouse/ || \ + cp "$wheel" wheelhouse/$(basename "$wheel" | sed 's/linux_x86_64/manylinux_2_17_x86_64/') + done + rm -f dist/*.whl && cp wheelhouse/*.whl dist/ + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: kt-kernel-cuda-wheels-py${{ matrix.python-version }} + path: kt-kernel/dist/*.whl + retention-days: 7 + publish-pypi: name: Publish to PyPI - needs: build-kt-kernel + needs: [build-kt-kernel, build-kt-kernel-cuda] runs-on: [self-hosted, linux, x64] if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main' environment: prod @@ -186,11 +288,27 @@ jobs: echo "Total: $(ls -1 dist/*.whl | wc -l) wheels (3 Python versions: 3.10, 3.11, 3.12)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Features" >> $GITHUB_STEP_SUMMARY - echo "**CPU-only build with multi-variant support:**" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**CPU wheels with multi-variant support:**" >> $GITHUB_STEP_SUMMARY echo "- ✅ AMX (Intel Sapphire Rapids+)" >> $GITHUB_STEP_SUMMARY echo "- ✅ AVX512 (Intel Skylake-X/Ice Lake/Cascade Lake)" >> $GITHUB_STEP_SUMMARY echo "- ✅ AVX2 (Maximum compatibility)" >> $GITHUB_STEP_SUMMARY + echo "- 🔧 Runtime CPU detection: Automatically selects optimal variant" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "**Runtime CPU detection:** Automatically selects the best variant for your CPU" >> $GITHUB_STEP_SUMMARY + echo "**CUDA wheels with multi-architecture support:**" >> $GITHUB_STEP_SUMMARY + echo "- ✅ SM 80 (Ampere: A100, RTX 3000 series)" 
>> $GITHUB_STEP_SUMMARY + echo "- ✅ SM 86 (Ampere: RTX 3060-3090)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ SM 89 (Ada Lovelace: RTX 4000 series)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ SM 90 (Hopper: H100)" >> $GITHUB_STEP_SUMMARY + echo "- 🔧 Static CUDA runtime: Compatible with CUDA 11.8+ and 12.x drivers" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Installation:**" >> $GITHUB_STEP_SUMMARY + echo '```bash' >> $GITHUB_STEP_SUMMARY + echo "# CPU version" >> $GITHUB_STEP_SUMMARY + echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}+cpu" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "# CUDA version (requires NVIDIA driver with CUDA 11.8+ or 12.x support)" >> $GITHUB_STEP_SUMMARY + echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}+cuda118" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "PyPI link: https://pypi.org/project/kt-kernel/#history" >> $GITHUB_STEP_SUMMARY diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt index 20280a3..de0cdb2 100644 --- a/kt-kernel/CMakeLists.txt +++ b/kt-kernel/CMakeLists.txt @@ -16,6 +16,7 @@ option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, A option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" OFF) option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF) option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF) +option(KTRANSFORMERS_CUDA_STATIC_RUNTIME "ktransformers: statically link CUDA runtime" ON) option(KTRANSFORMERS_CPU_USE_KML "ktransformers: CPU use KML" OFF) option(KTRANSFORMERS_CPU_USE_AMX_AVX512 "ktransformers: CPU use AMX or AVX512" OFF) option(KTRANSFORMERS_CPU_USE_AMX "ktransformers: CPU use AMX" OFF) @@ -415,6 +416,25 @@ if(KTRANSFORMERS_USE_CUDA) message(STATUS "enabling CUDA") enable_language(CUDA) add_compile_definitions(KTRANSFORMERS_USE_CUDA=1) + + # Set default CUDA architectures if not specified + # Target: SM 80/86 (Ampere), 89 (Ada), 90 
(Hopper) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90" CACHE STRING "CUDA architectures" FORCE) + message(STATUS "CUDA architectures (default): ${CMAKE_CUDA_ARCHITECTURES}") + else() + message(STATUS "CUDA architectures (user): ${CMAKE_CUDA_ARCHITECTURES}") + endif() + + # Optimization flags + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + + message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}") + message(STATUS "CUDA toolkit: ${CUDAToolkit_VERSION}") + message(STATUS "CUDA flags: ${CMAKE_CUDA_FLAGS}") elseif(KTRANSFORMERS_USE_ROCM) find_package(HIP REQUIRED) if(HIP_FOUND) @@ -629,7 +649,32 @@ endif() if(KTRANSFORMERS_USE_CUDA) - target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so") + # Link CUDA runtime (static or dynamic) + if(KTRANSFORMERS_CUDA_STATIC_RUNTIME) + # Platform-aware static library path + if(WIN32) + set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/cudart_static.lib") + else() + set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/libcudart_static.a") + endif() + + if(EXISTS "${CUDART_STATIC_LIB}") + target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDART_STATIC_LIB}") + message(STATUS "CUDA runtime: static (${CUDART_STATIC_LIB})") + + # Linux needs additional libs for static cudart + if(UNIX AND NOT APPLE) + target_link_libraries(${PROJECT_NAME} PRIVATE rt pthread dl) + endif() + else() + message(WARNING "Static CUDA runtime not found, using dynamic") + target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) + endif() + else() + # Dynamic linking + target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart) + message(STATUS "CUDA runtime: dynamic") + endif() endif() if(KTRANSFORMERS_USE_ROCM) add_compile_definitions(USE_HIP=1) diff --git a/kt-kernel/README.md b/kt-kernel/README.md index 82ce00b..2a39d31 100644 --- 
a/kt-kernel/README.md +++ b/kt-kernel/README.md @@ -43,16 +43,18 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo ### Option 1: Install from PyPI (Recommended for Most Users) -Install the latest stable version: +#### CPU-Only Installation + +Install the latest CPU-only version: ```bash -pip install kt-kernel +pip install "kt-kernel==0.5.0+cpu" ``` -Or install a specific version: +Or let pip auto-select the latest CPU version: ```bash -pip install kt-kernel==0.4.3 +pip install kt-kernel # Defaults to CPU version ``` > **Note**: Check the [latest version on PyPI](https://pypi.org/project/kt-kernel/#history) @@ -68,6 +70,43 @@ pip install kt-kernel==0.4.3 - Linux x86-64 (manylinux_2_17 compatible) - CPU with AVX2 support (Intel Haswell 2013+, AMD Zen+) +#### CUDA Installation (GPU Acceleration) + +For NVIDIA GPU-accelerated inference: + +```bash +pip install "kt-kernel==0.5.0+cuda118" +``` + +**Features:** +- ✅ **Multi-architecture support**: Single wheel supports SM 80/86/89/90 (Ampere, Ada, Hopper) +- ✅ **Static CUDA runtime**: No CUDA toolkit installation required +- ✅ **Broad compatibility**: Works with CUDA 11.8+ and 12.x drivers +- ✅ **PyTorch compatible**: Works with any PyTorch CUDA variant (cu118, cu121, cu124) + +**Requirements:** +- Python 3.10, 3.11, or 3.12 +- Linux x86-64 (manylinux_2_17 compatible) +- NVIDIA GPU with compute capability 8.0+ (Ampere or newer) + - ✅ Supported: A100, RTX 3000/4000 series, H100 + - ❌ Not supported: V100, P100, GTX 1000 / RTX 2000 series (too old) +- NVIDIA driver with CUDA 11.8+ or 12.x support (no CUDA toolkit needed) + +**GPU Compatibility Matrix:** + +| GPU Architecture | Compute Capability | Supported | Example GPUs | +|-----------------|-------------------|-----------|-------------| +| Hopper | 9.0 | ✅ | H100, H200 | +| Ada Lovelace | 8.9 | ✅ | RTX 4090, 4080, 4070 | +| Ampere | 8.6 | ✅ | RTX 3090, 3080, 3070, 3060 | +| Ampere | 8.0 | ✅ | A100, A30 | +| Turing | 7.5 | ❌ | RTX 2080, T4 | +| 
Volta | 7.0 | ❌ | V100 | + +**CUDA Driver Compatibility:** +- CUDA 11.8, 12.0-12.6+: Full support +- CUDA 11.0-11.7: Not supported (use CPU version or upgrade driver) + **CPU Variants Included:** The wheel includes 3 optimized variants that are **automatically selected at runtime** based on your CPU: diff --git a/kt-kernel/setup.py b/kt-kernel/setup.py index b0f17cd..e598c2d 100644 --- a/kt-kernel/setup.py +++ b/kt-kernel/setup.py @@ -610,6 +610,9 @@ class CMakeBuild(build_ext): _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS") _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE") + # CUDA static runtime toggle + _forward_bool_env(cmake_args, "CPUINFER_CUDA_STATIC_RUNTIME", "KTRANSFORMERS_CUDA_STATIC_RUNTIME") + # GPU backends (mutually exclusive expected) if _env_get_bool("CPUINFER_USE_CUDA", False): cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON") @@ -632,11 +635,11 @@ class CMakeBuild(build_ext): hostcxx = os.environ["CUDAHOSTCXX"] cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}") print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}") - # Respect user-provided architectures only (no default auto-detection). 
- archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip() + # Set CUDA architectures (default: Ampere/Ada/Hopper) + archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "80;86;89;90").strip() if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args): cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}") - print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}") + print(f"-- Set CUDA architectures: {archs_env}") if _env_get_bool("CPUINFER_USE_ROCM", False): cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON") if _env_get_bool("CPUINFER_USE_MUSA", False): @@ -685,15 +688,34 @@ class CMakeBuild(build_ext): ################################################################################ -# Import version from shared version.py at project root +# Read base version from version.py _version_file = Path(__file__).resolve().parent.parent / "version.py" if _version_file.exists(): _version_ns = {} with open(_version_file, "r", encoding="utf-8") as f: exec(f.read(), _version_ns) - VERSION = os.environ.get("CPUINFER_VERSION", _version_ns.get("__version__", "0.4.2")) + _base_version = _version_ns.get("__version__", "0.5.0") else: - VERSION = os.environ.get("CPUINFER_VERSION", "0.4.2") + _base_version = "0.5.0" + +# Auto-detect version suffix based on build type +if "CPUINFER_VERSION" in os.environ: + # User explicitly set version (e.g., for testing) + VERSION = os.environ["CPUINFER_VERSION"] + print(f"-- Explicit version: {VERSION}") +else: + # Auto-detect suffix based on CUDA usage + cuda_enabled = _env_get_bool("CPUINFER_USE_CUDA", False) + + if cuda_enabled: + # CUDA build: add +cuda118 suffix + # (CUDA 11.8 is the build toolkit version for compatibility with 11.8+ and 12.x) + VERSION = f"{_base_version}+cuda118" + print(f"-- CUDA wheel version: {VERSION}") + else: + # CPU-only build: add +cpu suffix + VERSION = f"{_base_version}+cpu" + print(f"-- CPU wheel version: {VERSION}") 
################################################################################ # Setup