update PyPI Install and readme (#1731)

This commit is contained in:
Jianwei Dong
2025-12-18 17:21:47 +08:00
committed by GitHub
parent 3c134359bc
commit 39449ed1af
3 changed files with 208 additions and 43 deletions

View File

@@ -50,59 +50,67 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
### Option 1: Install from PyPI (Recommended for Most Users)
Coming soon...
Choose the version matching your CUDA installation:
Install the latest stable version:
```bash
# For CUDA 11.8
pip install kt-kernel==0.4.2.cu118
# For CUDA 12.1
pip install kt-kernel==0.4.2.cu121
# For CUDA 12.4
pip install kt-kernel==0.4.2.cu124
# For CUDA 12.6
pip install kt-kernel==0.4.2.cu126
pip install kt-kernel
```
> **Note**: Replace `0.4.2` with the [latest version](https://pypi.org/project/kt-kernel/#history) if available.
Or install a specific version:
```bash
pip install kt-kernel==0.4.3
```
> **Note**: See the [release history on PyPI](https://pypi.org/project/kt-kernel/#history) for the latest available version.
**Features:**
- **Automatic CPU detection**: Detects your CPU and loads the optimal kernel variant
- **Multi-variant wheel**: Includes AMX, AVX512, and AVX2 variants in a single package
- **No compilation needed**: Pre-built wheels for Python 3.10, 3.11, 3.12
- **Multiple CUDA versions**: Choose the version matching your environment
- **Universal compatibility**: Works on any x86-64 Linux system whose CPU supports AVX2 (2013+)
**Requirements:**
- CUDA 11.8+ or 12.x runtime (must match the package version you install)
- PyTorch 2.0+ (install separately, must match CUDA version)
- Linux x86-64
- Python 3.10, 3.11, or 3.12
- Linux x86-64 (manylinux_2_17 compatible)
- CPU with AVX2 support (Intel Haswell 2013+, AMD Zen+)
**CPU Variants Included:**
| Variant | CPU Support | Use Case |
|---------|-------------|----------|
| **AMX** | Intel Sapphire Rapids+ | Best performance on latest Intel CPUs |
| **AVX512** | Intel Skylake-X/Ice Lake/Cascade Lake | AVX512-capable CPUs without AMX |
| **AVX2** | Intel Haswell+, AMD Zen+ | Maximum compatibility |
**Check which variant is loaded:**
The wheel includes 3 optimized variants that are **automatically selected at runtime** based on your CPU:
| Variant | CPU Support | Performance | Auto-Selected When |
|---------|-------------|-------------|-------------------|
| **AMX** | Intel Sapphire Rapids+ (2023+) | ⚡⚡⚡ Best | AMX instructions detected |
| **AVX512** | Intel Skylake-X/Ice Lake/Cascade Lake (2017+), AMD Zen 4+ | ⚡⚡ Great | AVX512 instructions detected |
| **AVX2** | Intel Haswell+ (2013+), AMD Zen+ | ⚡ Good | Fallback for maximum compatibility |
**Verify installation:**
```python
import kt_kernel
# Check which CPU variant was loaded
print(f"CPU variant: {kt_kernel.__cpu_variant__}") # 'amx', 'avx512', or 'avx2'
print(f"Version: {kt_kernel.__version__}")
# Test import
from kt_kernel import KTMoEWrapper
print("✓ kt-kernel installed successfully!")
```
**Environment Variables:**
```bash
# Override automatic CPU detection
export KT_KERNEL_CPU_VARIANT=avx2 # or 'avx512', 'amx'
# Override automatic CPU detection (for testing or debugging)
export KT_KERNEL_CPU_VARIANT=avx2 # Force AVX2 variant (options: 'avx2', 'avx512', 'amx')
# Enable debug output
# Enable debug output to see detection process
export KT_KERNEL_DEBUG=1
python -c "import kt_kernel"
# Output:
# [kt-kernel] Detected AMX support via /proc/cpuinfo
# [kt-kernel] Selected CPU variant: amx
# [kt-kernel] Loading amx from: /path/to/_kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
# [kt-kernel] Successfully loaded AMX variant
```
---

View File

@@ -49,15 +49,32 @@ kt_kernel_ext = _kt_kernel_ext
# Import main API
from .experts import KTMoEWrapper
# Resolve the package version from version.py, located three dirname() steps
# up from this file. The hardcoded default is used when that file is missing
# or does not define __version__.
import os

_root_version_file = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    'version.py',
)
__version__ = "0.4.2"
if os.path.exists(_root_version_file):
    _version_ns = {}
    with open(_root_version_file, 'r', encoding='utf-8') as f:
        # version.py is executed in an isolated namespace so that only its
        # __version__ value is picked up.
        exec(f.read(), _version_ns)
    __version__ = _version_ns.get('__version__', '0.4.2')
# Resolve __version__, in order of preference:
#   1. the installed distribution metadata for 'kt-kernel'
#   2. version.py in the source tree (three dirname() steps up from this file)
#   3. the hardcoded default
# The outer handler covers the whole body, so an ImportError raised anywhere
# inside it routes to the pkg_resources fallback.
try:
    from importlib.metadata import version, PackageNotFoundError

    try:
        __version__ = version('kt-kernel')
    except PackageNotFoundError:
        # No installed distribution: look for version.py in the source tree.
        import os

        _root_version_file = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            'version.py',
        )
        __version__ = "0.4.3"
        if os.path.exists(_root_version_file):
            _version_ns = {}
            with open(_root_version_file, 'r', encoding='utf-8') as f:
                # Execute version.py in an isolated namespace and read back
                # only its __version__ value.
                exec(f.read(), _version_ns)
            __version__ = _version_ns.get('__version__', '0.4.3')
except ImportError:
    # importlib.metadata is unavailable (Python < 3.8): try pkg_resources,
    # keeping the hardcoded default if that lookup is not possible either.
    __version__ = "0.4.3"
    try:
        from pkg_resources import get_distribution, DistributionNotFound

        try:
            __version__ = get_distribution('kt-kernel').version
        except DistributionNotFound:
            pass
    except ImportError:
        pass
__all__ = ["KTMoEWrapper", "kt_kernel_ext", "__cpu_variant__", "__version__"]

View File

@@ -239,6 +239,151 @@ class CMakeBuild(build_ext):
return info
def build_extension(self, ext: CMakeExtension):
    """
    Entry point for building the extension.

    Reads the CPUINFER_BUILD_ALL_VARIANTS flag (default: off). When it is
    set, the multi-variant build (AMX, AVX512, AVX2) is run; otherwise the
    original single-variant build is run.

    Args:
        ext: The CMakeExtension to build.
    """
    build_all = _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False)
    # Pick the build routine once, then invoke it with the extension.
    builder = self.build_multi_variants if build_all else self._build_single_variant
    builder(ext)
def build_multi_variants(self, ext: CMakeExtension):
    """
    Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel.

    Each variant is selected through the CPUINFER_CPU_INSTRUCT,
    CPUINFER_ENABLE_AMX and CPUINFER_ENABLE_AVX512 environment variables,
    built in its own temporary directory, and its shared object is renamed
    to carry the variant name. This creates 3 separate .so files:
    - _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
    - _kt_kernel_ext_avx512.cpython-311-x86_64-linux-gnu.so
    - _kt_kernel_ext_avx2.cpython-311-x86_64-linux-gnu.so

    Runtime CPU detection (in _cpu_detect.py) will automatically load the best one.

    The three environment variables are put back to their original state
    when this method exits, including when a variant build raises.

    Args:
        ext: The CMakeExtension to build.
    """
    print("=" * 70)
    print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)")
    print("=" * 70)
    print()
    print("This will build three variants in a single wheel:")
    print(" - AMX variant (Intel Sapphire Rapids+)")
    print(" - AVX512 variant (Intel Skylake-X/Ice Lake+, AMD Zen 4+)")
    print(" - AVX2 variant (maximum compatibility, 2013+)")
    print()
    print("Runtime CPU detection will automatically select the best variant.")
    print()

    extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
    cfg = default_build_type()
    # Last dotted component of the extension name; the built file is
    # expected to be named "<module_name>.<abi tag>.so".
    module_name = ext.name.split(".")[-1]

    # Remember the current values (None = unset) so they can be restored.
    managed_vars = (
        "CPUINFER_CPU_INSTRUCT",
        "CPUINFER_ENABLE_AMX",
        "CPUINFER_ENABLE_AVX512",
    )
    saved_env = {name: os.environ.get(name) for name in managed_vars}

    # Variant configurations: (name, CPUINFER_CPU_INSTRUCT, CPUINFER_ENABLE_AMX)
    variants = [
        ("amx", "AVX512", "ON"),  # AVX512 + AMX
        ("avx512", "AVX512", "OFF"),  # AVX512 only
        ("avx2", "AVX2", "OFF"),  # AVX2 only
    ]
    missing_variants = []

    try:
        for variant_name, cpu_instruct, enable_amx in variants:
            print("=" * 70)
            print(f"Building {variant_name.upper()} variant...")
            print("=" * 70)
            print()

            # Set environment variables for this variant
            os.environ["CPUINFER_CPU_INSTRUCT"] = cpu_instruct
            os.environ["CPUINFER_ENABLE_AMX"] = enable_amx
            # The AVX512 umbrella is enabled for the AMX and AVX512 variants
            # and disabled for AVX2 to prevent AVX512 code.
            os.environ["CPUINFER_ENABLE_AVX512"] = "OFF" if variant_name == "avx2" else "ON"

            # Use separate build directory for each variant
            build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}_{variant_name}"
            build_temp.mkdir(parents=True, exist_ok=True)

            # Build this variant
            self._build_single_variant_impl(ext, extdir, build_temp, cfg)

            # Rename the built .so file to include variant suffix
            # Original name: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
            # New name: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
            built_so_files = list(extdir.glob(f"{module_name}.*.so"))
            if not built_so_files:
                print(f"⚠ Warning: Could not find built .so file for {variant_name} variant")
                print()
                missing_variants.append(variant_name)
                continue

            original_so = built_so_files[0]
            # Strip only the leading module name (the glob guarantees it is a
            # prefix); str.replace would also drop any later occurrence.
            # e.g., "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so" -> ".cpython-311-x86_64-linux-gnu.so"
            suffix = original_so.name[len(module_name):]
            new_name = f"_kt_kernel_ext_{variant_name}{suffix}"
            new_path = extdir / new_name

            # Remove existing file if present
            if new_path.exists():
                new_path.unlink()

            original_so.rename(new_path)
            print(f"✓ Built and renamed to: {new_name}")
            print()
    finally:
        # Restore original env vars even if a variant build failed, so the
        # per-variant settings do not leak into the rest of the build.
        for name, value in saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

    variant_files = sorted(extdir.glob("_kt_kernel_ext_*.so"))
    print("=" * 70)
    if missing_variants:
        # Do not claim success when a variant produced no shared object.
        print(f"⚠ Warning: No .so file was produced for: {', '.join(missing_variants)}")
    else:
        print("✓ All variants built successfully!")
    print("=" * 70)
    print()
    print(f"The wheel now contains {len(variant_files)} CPU variants:")
    for so_file in variant_files:
        print(f" - {so_file.name}")
    print()
def _build_single_variant(self, ext: CMakeExtension):
    """
    Build one variant of the extension (the original, pre-multi-variant path).

    Works out the output directory, the build type and a per-extension
    temporary build directory, then hands off to
    _build_single_variant_impl().

    Args:
        ext: The CMakeExtension to build.
    """
    ext_output_path = Path(self.get_ext_fullpath(ext.name))
    output_dir = ext_output_path.parent.resolve()
    build_type = default_build_type()

    # One temporary directory per (extension, build type) pair.
    temp_dir = Path(self.build_temp).joinpath(f"{ext.name}_{build_type}")
    temp_dir.mkdir(parents=True, exist_ok=True)

    self._build_single_variant_impl(ext, output_dir, temp_dir, build_type)
def _build_single_variant_impl(self, ext: CMakeExtension, extdir: Path, build_temp: Path, cfg: str):
"""
Core build logic for a single variant.
This method contains the actual CMake configuration and build steps.
It's called by both _build_single_variant() and build_multi_variants().
Args:
ext: The CMakeExtension to build
extdir: Directory where the .so file should be placed
build_temp: Temporary build directory for CMake
cfg: Build type (Release/Debug/etc.)
"""
# Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
def detect_cuda_toolkit() -> bool:
# Respect CUDA_HOME
@@ -287,11 +432,6 @@ class CMakeBuild(build_ext):
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
cfg = default_build_type()
build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}"
build_temp.mkdir(parents=True, exist_ok=True)
# Base CMake args
cmake_args = [
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/",