From 559a3ad4ac093d014793d65bea6f9a4eb113f0b9 Mon Sep 17 00:00:00 2001
From: Jianwei Dong <dongjw24@mails.tsinghua.edu.cn>
Date: Mon, 29 Dec 2025 11:19:43 +0800
Subject: [PATCH] fix pypi cuda install (#1763)

---
 .github/workflows/release-pypi.yml | 134 +++++++++++++++++++++++++++--
 kt-kernel/CMakeLists.txt           |  47 +++++++++-
 kt-kernel/README.md                |  47 +++++++++-
 kt-kernel/setup.py                 |  34 ++++++--
 4 files changed, 243 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
index 9375d6a..e430e1b 100644
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -89,18 +89,42 @@ jobs:
           pip install auditwheel patchelf
           echo "Repairing wheels for manylinux compatibility..."
           mkdir -p wheelhouse
+
           for wheel in dist/*.whl; do
             echo "Processing $wheel..."
-            auditwheel repair "$wheel" --plat manylinux_2_17_x86_64 -w wheelhouse/ || {
-              echo "Warning: auditwheel repair failed, trying to rename platform tag..."
-              # Fallback: rename the wheel file with manylinux tag
+            success=0
+
+            # Try different manylinux versions (newest to oldest)
+            for plat in manylinux_2_31_x86_64 manylinux_2_28_x86_64 manylinux_2_17_x86_64; do
+              echo "  Trying $plat..."
+              if auditwheel repair "$wheel" --plat "$plat" -w wheelhouse/ 2>&1; then
+                echo "  ✓ Successfully repaired with $plat"
+                success=1
+                break
+              fi
+            done
+
+            # If all auditwheel attempts failed, use rename fallback
+            if [ $success -eq 0 ]; then
+              echo "  Warning: auditwheel repair failed, using rename fallback..."
               wheel_name=$(basename "$wheel")
-              new_name=$(echo "$wheel_name" | sed 's/linux_x86_64/manylinux_2_17_x86_64/')
+              # Use # as sed delimiter to avoid conflict with /
+              new_name=$(echo "$wheel_name" | sed 's#linux_x86_64#manylinux_2_17_x86_64#')
               cp "$wheel" "wheelhouse/$new_name"
-            }
+              echo "  ✓ Renamed to $new_name"
+            fi
           done
+
           echo "Repaired wheels:"
           ls -lh wheelhouse/
+
+          # Verify each repaired wheel ships at least one CPU-variant library (amx/avx512/avx2)
+          echo "Verifying CPU variants in repaired wheels..."
+          for wheel in wheelhouse/*.whl; do
+            echo "Checking $(basename "$wheel"):"
+            python -m zipfile -l "$wheel" | grep "\.so" | grep -E "(amx|avx512|avx2)" || { echo "ERROR: no CPU variant libraries found in $wheel"; exit 1; }
+          done
+
           # Replace original wheels with repaired ones
           rm -f dist/*.whl
           cp wheelhouse/*.whl dist/
@@ -112,9 +136,87 @@ jobs:
           path: kt-kernel/dist/*.whl
           retention-days: 7
 
+  build-kt-kernel-cuda:
+    name: Build kt-kernel CUDA (Python ${{ matrix.python-version }})
+    runs-on: [self-hosted, linux, x64, gpu]
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10', '3.11', '3.12']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Verify CUDA availability
+        run: |
+          nvidia-smi || (echo "ERROR: GPU not available" && exit 1)
+          nvcc --version || (echo "ERROR: CUDA toolkit not found" && exit 1)
+
+      - name: Install dependencies
+        run: |
+          apt-get update && apt-get install -y cmake libhwloc-dev pkg-config libnuma-dev
+          python -m pip install --upgrade pip
+          pip install build wheel setuptools && pip install torch --index-url https://download.pytorch.org/whl/cu118  # only torch comes from the PyTorch index
+
+      - name: Build CUDA wheel
+        working-directory: kt-kernel
+        env:
+          CPUINFER_USE_CUDA: '1'
+          CPUINFER_CUDA_ARCHS: '80;86;89;90'
+          CPUINFER_CUDA_STATIC_RUNTIME: '1'
+          CPUINFER_BUILD_TYPE: 'Release'
+          CPUINFER_PARALLEL: '4'
+          CPUINFER_FORCE_REBUILD: '1'
+          CUDA_HOME: '/usr/local/cuda-11.8'
+        run: |
+          echo "Building CUDA wheel for SM 80, 86, 89, 90"
+          python -m build --wheel -v
+
+      - name: Verify wheel
+        working-directory: kt-kernel
+        run: |
+          ls -lh dist/
+          # Check version suffix
+          [[ $(ls dist/*.whl) == *"+cuda118"* ]] || (echo "ERROR: Missing +cuda118 suffix" && exit 1)
+
+          # Install and test
+          pip install dist/*.whl
+          python -c "import kt_kernel; print(f'Version: {kt_kernel.__version__}')"
+
+          # Verify static linking (should NOT depend on libcudart.so); unpack into a fresh temp dir so reruns never see stale files
+          check_dir=$(mktemp -d) && unzip -q dist/*.whl -d "$check_dir"
+          ! ldd "$check_dir"/kt_kernel/*.so | grep -q "libcudart.so" || (echo "ERROR: Dynamic cudart found" && exit 1)
+          echo "✓ CUDA runtime statically linked"
+
+      - name: Repair wheel for manylinux
+        working-directory: kt-kernel
+        run: |
+          pip install auditwheel patchelf
+          mkdir -p wheelhouse
+          for wheel in dist/*.whl; do
+            auditwheel repair "$wheel" --plat manylinux_2_17_x86_64 --exclude libcuda.so.1 -w wheelhouse/ || \
+              cp "$wheel" wheelhouse/$(basename "$wheel" | sed 's/linux_x86_64/manylinux_2_17_x86_64/')
+          done
+          rm -f dist/*.whl && cp wheelhouse/*.whl dist/
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: kt-kernel-cuda-wheels-py${{ matrix.python-version }}
+          path: kt-kernel/dist/*.whl
+          retention-days: 7
+
   publish-pypi:
     name: Publish to PyPI
-    needs: build-kt-kernel
+    needs: [build-kt-kernel, build-kt-kernel-cuda]
     runs-on: [self-hosted, linux, x64]
     if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
     environment: prod
@@ -186,11 +288,27 @@ jobs:
           echo "Total: $(ls -1 dist/*.whl | wc -l) wheels (3 Python versions: 3.10, 3.11, 3.12)" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "### Features" >> $GITHUB_STEP_SUMMARY
-          echo "**CPU-only build with multi-variant support:**" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "**CPU wheels with multi-variant support:**" >> $GITHUB_STEP_SUMMARY
           echo "- ✅ AMX (Intel Sapphire Rapids+)" >> $GITHUB_STEP_SUMMARY
           echo "- ✅ AVX512 (Intel Skylake-X/Ice Lake/Cascade Lake)" >> $GITHUB_STEP_SUMMARY
           echo "- ✅ AVX2 (Maximum compatibility)" >> $GITHUB_STEP_SUMMARY
+          echo "- 🔧 Runtime CPU detection: Automatically selects optimal variant" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "**Runtime CPU detection:** Automatically selects the best variant for your CPU" >> $GITHUB_STEP_SUMMARY
+          echo "**CUDA wheels with multi-architecture support:**" >> $GITHUB_STEP_SUMMARY
+          echo "- ✅ SM 80 (Ampere: A100, A30)" >> $GITHUB_STEP_SUMMARY
+          echo "- ✅ SM 86 (Ampere: RTX 3060-3090)" >> $GITHUB_STEP_SUMMARY
+          echo "- ✅ SM 89 (Ada Lovelace: RTX 4000 series)" >> $GITHUB_STEP_SUMMARY
+          echo "- ✅ SM 90 (Hopper: H100)" >> $GITHUB_STEP_SUMMARY
+          echo "- 🔧 Static CUDA runtime: Compatible with CUDA 11.8+ and 12.x drivers" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "**Installation:**" >> $GITHUB_STEP_SUMMARY
+          echo '```bash' >> $GITHUB_STEP_SUMMARY
+          echo "# CPU version" >> $GITHUB_STEP_SUMMARY
+          echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}+cpu" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "# CUDA version (requires NVIDIA driver with CUDA 11.8+ or 12.x support)" >> $GITHUB_STEP_SUMMARY
+          echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}+cuda118" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "PyPI link: https://pypi.org/project/kt-kernel/#history" >> $GITHUB_STEP_SUMMARY
diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt
index 20280a3..de0cdb2 100644
--- a/kt-kernel/CMakeLists.txt
+++ b/kt-kernel/CMakeLists.txt
@@ -16,6 +16,7 @@ option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, A
 option(KTRANSFORMERS_USE_CUDA "ktransformers: use CUDA" OFF)
 option(KTRANSFORMERS_USE_MUSA "ktransformers: use MUSA" OFF)
 option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" OFF)
+option(KTRANSFORMERS_CUDA_STATIC_RUNTIME "ktransformers: statically link CUDA runtime" ON)
 option(KTRANSFORMERS_CPU_USE_KML "ktransformers: CPU use KML" OFF)
 option(KTRANSFORMERS_CPU_USE_AMX_AVX512 "ktransformers: CPU use AMX or AVX512" OFF)
 option(KTRANSFORMERS_CPU_USE_AMX "ktransformers: CPU use AMX" OFF)
@@ -415,6 +416,25 @@ if(KTRANSFORMERS_USE_CUDA)
     message(STATUS "enabling CUDA")
+
+    # Default CUDA architectures: SM 80/86 (Ampere), 89 (Ada), 90 (Hopper).
+    # Decide this BEFORE enable_language(CUDA): with policy CMP0104 NEW that call initializes the variable itself.
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90" CACHE STRING "CUDA architectures" FORCE)
+        message(STATUS "CUDA architectures (default): ${CMAKE_CUDA_ARCHITECTURES}")
+    else()
+        message(STATUS "CUDA architectures (user): ${CMAKE_CUDA_ARCHITECTURES}")
+    endif()
     enable_language(CUDA)
     add_compile_definitions(KTRANSFORMERS_USE_CUDA=1)
+
+    # Optimization flags
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --use_fast_math")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+    message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
+    message(STATUS "CUDA toolkit: ${CUDAToolkit_VERSION}")
+    message(STATUS "CUDA flags: ${CMAKE_CUDA_FLAGS}")
 elseif(KTRANSFORMERS_USE_ROCM)
     find_package(HIP REQUIRED)
     if(HIP_FOUND)
@@ -629,7 +649,32 @@ endif()
 
 
 if(KTRANSFORMERS_USE_CUDA)
-    target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDAToolkit_LIBRARY_DIR}/libcudart.so")
+    # Link CUDA runtime (static or dynamic)
+    if(KTRANSFORMERS_CUDA_STATIC_RUNTIME)
+        # Platform-aware static library path
+        if(WIN32)
+            set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/cudart_static.lib")
+        else()
+            set(CUDART_STATIC_LIB "${CUDAToolkit_LIBRARY_DIR}/libcudart_static.a")
+        endif()
+
+        if(EXISTS "${CUDART_STATIC_LIB}")
+            target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDART_STATIC_LIB}")
+            message(STATUS "CUDA runtime: static (${CUDART_STATIC_LIB})")
+
+            # Linux needs additional libs for static cudart
+            if(UNIX AND NOT APPLE)
+                target_link_libraries(${PROJECT_NAME} PRIVATE rt pthread dl)
+            endif()
+        else()
+            message(WARNING "Static CUDA runtime not found, using dynamic")
+            target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
+        endif()
+    else()
+        # Dynamic linking
+        target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
+        message(STATUS "CUDA runtime: dynamic")
+    endif()
 endif()
 if(KTRANSFORMERS_USE_ROCM)
     add_compile_definitions(USE_HIP=1)
diff --git a/kt-kernel/README.md b/kt-kernel/README.md
index 82ce00b..2a39d31 100644
--- a/kt-kernel/README.md
+++ b/kt-kernel/README.md
@@ -43,16 +43,18 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
 
 ### Option 1: Install from PyPI (Recommended for Most Users)
 
-Install the latest stable version:
+#### CPU-Only Installation
+
+Install a specific CPU-only version:
 
 ```bash
-pip install kt-kernel
+pip install "kt-kernel==0.5.0+cpu"
 ```
 
-Or install a specific version:
+Or let pip auto-select the latest CPU version:
 
 ```bash
-pip install kt-kernel==0.4.3
+pip install kt-kernel  # Defaults to CPU version
 ```
 
 > **Note**: Check the [latest version on PyPI](https://pypi.org/project/kt-kernel/#history)
@@ -68,6 +70,43 @@ pip install kt-kernel==0.4.3
 - Linux x86-64 (manylinux_2_17 compatible)
 - CPU with AVX2 support (Intel Haswell 2013+, AMD Zen+)
 
+#### CUDA Installation (GPU Acceleration)
+
+For NVIDIA GPU-accelerated inference:
+
+```bash
+pip install "kt-kernel==0.5.0+cuda118"
+```
+
+**Features:**
+- ✅ **Multi-architecture support**: Single wheel supports SM 80/86/89/90 (Ampere, Ada, Hopper)
+- ✅ **Static CUDA runtime**: No CUDA toolkit installation required
+- ✅ **Broad compatibility**: Works with CUDA 11.8+ and 12.x drivers
+- ✅ **PyTorch compatible**: Works with any PyTorch CUDA variant (cu118, cu121, cu124)
+
+**Requirements:**
+- Python 3.10, 3.11, or 3.12
+- Linux x86-64 (manylinux_2_17 compatible)
+- NVIDIA GPU with compute capability 8.0+ (Ampere or newer)
+  - ✅ Supported: A100, RTX 3000/4000 series, H100
+  - ❌ Not supported: V100, P100, GTX 1000 / RTX 2000 series (compute capability below 8.0)
+- NVIDIA driver with CUDA 11.8+ or 12.x support (no CUDA toolkit needed)
+
+**GPU Compatibility Matrix:**
+
+| GPU Architecture | Compute Capability | Supported | Example GPUs |
+|-----------------|-------------------|-----------|-------------|
+| Hopper | 9.0 | ✅ | H100, H200 |
+| Ada Lovelace | 8.9 | ✅ | RTX 4090, 4080, 4070 |
+| Ampere | 8.6 | ✅ | RTX 3090, 3080, 3070, 3060 |
+| Ampere | 8.0 | ✅ | A100, A30 |
+| Turing | 7.5 | ❌ | RTX 2080, T4 |
+| Volta | 7.0 | ❌ | V100 |
+
+**CUDA Driver Compatibility:**
+- CUDA 11.8 and 12.0-12.6+: Full support
+- CUDA 11.0-11.7: Not supported (use CPU version or upgrade driver)
+
 **CPU Variants Included:**
 
 The wheel includes 3 optimized variants that are **automatically selected at runtime** based on your CPU:
diff --git a/kt-kernel/setup.py b/kt-kernel/setup.py
index b0f17cd..e598c2d 100644
--- a/kt-kernel/setup.py
+++ b/kt-kernel/setup.py
@@ -610,6 +610,9 @@ class CMakeBuild(build_ext):
         _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS")
         _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE")
 
+        # CUDA static runtime toggle
+        _forward_bool_env(cmake_args, "CPUINFER_CUDA_STATIC_RUNTIME", "KTRANSFORMERS_CUDA_STATIC_RUNTIME")
+
         # GPU backends (mutually exclusive expected)
         if _env_get_bool("CPUINFER_USE_CUDA", False):
             cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON")
@@ -632,11 +635,11 @@ class CMakeBuild(build_ext):
                 hostcxx = os.environ["CUDAHOSTCXX"]
                 cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}")
                 print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}")
-            # Respect user-provided architectures only (no default auto-detection).
-            archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip()
+            # Set CUDA architectures (default: Ampere/Ada/Hopper)
+            archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "80;86;89;90").strip()
             if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args):
                 cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}")
-                print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}")
+                print(f"-- Set CUDA architectures: {archs_env}")
         if _env_get_bool("CPUINFER_USE_ROCM", False):
             cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON")
         if _env_get_bool("CPUINFER_USE_MUSA", False):
@@ -685,15 +688,34 @@ class CMakeBuild(build_ext):
 ################################################################################
 
 
-# Import version from shared version.py at project root
+# Read base version from version.py
 _version_file = Path(__file__).resolve().parent.parent / "version.py"
 if _version_file.exists():
     _version_ns = {}
     with open(_version_file, "r", encoding="utf-8") as f:
         exec(f.read(), _version_ns)
-    VERSION = os.environ.get("CPUINFER_VERSION", _version_ns.get("__version__", "0.4.2"))
+    _base_version = _version_ns.get("__version__", "0.5.0")
 else:
-    VERSION = os.environ.get("CPUINFER_VERSION", "0.4.2")
+    _base_version = "0.5.0"
+
+# Pick the wheel version: explicit override first, otherwise base version + build-type suffix
+if os.environ.get("CPUINFER_VERSION", "").strip():
+    # User explicitly set version (e.g., for testing); an empty value is treated as unset
+    VERSION = os.environ["CPUINFER_VERSION"].strip()
+    print(f"-- Explicit version: {VERSION}")
+else:
+    # NOTE(review): "+cpu"/"+cuda118" are PEP 440 local version labels; PyPI is understood
+    # to reject uploads that carry a local version -- confirm the publish path handles this.
+    cuda_enabled = _env_get_bool("CPUINFER_USE_CUDA", False)
+    if cuda_enabled:
+        # CUDA build: add +cuda118 suffix
+        # (CUDA 11.8 is the build toolkit version for compatibility with 11.8+ and 12.x)
+        VERSION = f"{_base_version}+cuda118"
+        print(f"-- CUDA wheel version: {VERSION}")
+    else:
+        # CPU-only build: add +cpu suffix
+        VERSION = f"{_base_version}+cpu"
+        print(f"-- CPU wheel version: {VERSION}")
 
 ################################################################################
 # Setup