#!/usr/bin/env python # -*- coding: utf-8 -*- """ Lightweight packaging script for building and distributing kt-kernel, a high-performance kernel operations library for KTransformers. pip install kt-kernel >>> from kt_kernel import AMXMoEWrapper This script drives your existing CMake build (root `CMakeLists.txt`) and only needs a working C++ toolchain, CMake (>=3.16), and pybind11 (vendored already in the repo). Environment knobs (export before running pip install .): CPUINFER_FORCE_REBUILD=1 Always rebuild (ignore any cached build) CPUINFER_BUILD_TYPE=Release Debug / RelWithDebInfo / Release CPUINFER_PARALLEL=8 Parallel build jobs (auto = detected cores) CPUINFER_CPU_INSTRUCT=FANCY One of: NATIVE|FANCY|AVX512|AVX2 (maps to CMake flags) CPUINFER_ENABLE_AMX=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX CPUINFER_ENABLE_MLA=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MLA CPUINFER_ENABLE_BLIS=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD CPUINFER_ENABLE_KML=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML CPUINFER_ENABLE_AVX512=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512 CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT CPUINFER_ENABLE_LTO=ON ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option) CPUINFER_LTO_JOBS=8 Forward to -DCPUINFER_LTO_JOBS CPUINFER_LTO_MODE=auto Forward to -DCPUINFER_LTO_MODE CPUINFER_NATIVE=ON (override LLAMA_NATIVE) GPU backends (if ever added later, keep placeholders): CPUINFER_USE_CUDA=0/1 -DKTRANSFORMERS_USE_CUDA CPUINFER_USE_ROCM=0/1 -DKTRANSFORMERS_USE_ROCM CPUINFER_USE_MUSA=0/1 -DKTRANSFORMERS_USE_MUSA Usage: pip install . Or build wheel: python -m build (if you have build/installed) Resulting wheel exposes a top-level package `kt_kernel` with AMXMoEWrapper and other kernel wrappers. """ from __future__ import annotations import os import re import sys import platform import subprocess from pathlib import Path from setuptools import setup, Extension from setuptools.command.build_ext import build_ext import shutil # ------------------------- # Env parsing helpers # ------------------------- def _env_get_bool(name: str, default: bool | None = None) -> bool | None: v = os.environ.get(name) if v is None: return default val = v.strip().lower() if val in ("1", "on", "true", "yes", "y", "enable", "enabled"): return True if val in ("0", "off", "false", "no", "n", "disable", "disabled"): return False return default def _cmake_onoff(flag: bool) -> str: return "ON" if flag else "OFF" def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: """If env exists, forward it to CMake as -D=ON/OFF and return True; else return False.""" b = _env_get_bool(env_name, None) if b is None: return False cmake_args.append(f"-D{cmake_flag}={_cmake_onoff(b)}") print(f"-- Forward {env_name} -> -D{cmake_flag}={_cmake_onoff(b)}") return True def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool: v = os.environ.get(env_name) if not v: return False cmake_args.append(f"-D{cmake_flag}={v}") print(f"-- Forward {env_name} -> -D{cmake_flag}={v}") return True ################################################################################ # Helpers ################################################################################ REPO_ROOT = Path(__file__).parent.resolve() CPU_FEATURE_MAP = { "FANCY": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON", "AVX512": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON", "AVX2": "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON", "NATIVE": "-DLLAMA_NATIVE=ON", } def default_build_type() -> str: return os.environ.get("CPUINFER_BUILD_TYPE", "Release") def detect_parallel_jobs() -> str: if "CPUINFER_PARALLEL" in os.environ: return os.environ["CPUINFER_PARALLEL"] try: import multiprocessing return str(multiprocessing.cpu_count()) except Exception: return "1" def cpu_feature_flags() -> list[str]: mode = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper() return [tok for tok in CPU_FEATURE_MAP.get(mode, CPU_FEATURE_MAP["NATIVE"]).split() if tok] ################################################################################ # CMakeExtension + builder ################################################################################ class CMakeExtension(Extension): def __init__(self, name: str, sourcedir: str = ""): super().__init__(name, sources=[]) self.sourcedir = str(Path(sourcedir).resolve()) class CMakeBuild(build_ext): def run(self): # Ensure CMake present try: subprocess.run(["cmake", "--version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except Exception as e: # pragma: no cover raise RuntimeError("CMake is required to build this project") from e super().run() def detect_cpu_info(self) -> dict: """Detect CPU vendor/arch and instruction set features. Returns a dict like: { 'vendor': 'intel'|'amd'|'arm'|'unknown', 'arch': platform.machine().lower(), 'features': set(['AVX2','AVX512','AMX']), 'raw': { 'flags': set([...]) } } """ info = { "vendor": "unknown", "arch": platform.machine().lower(), "features": set(), "raw": {"flags": set()}, } try: sysname = platform.system() if sysname == "Linux": with open("/proc/cpuinfo", "r", encoding="utf-8", errors="ignore") as f: cpuinfo = f.read() low = cpuinfo.lower() # vendor if "vendor_id" in low: # Typical x86 linux m = re.search(r"vendor_id\s*:\s*(\S+)", cpuinfo) if m: v = m.group(1).lower() if "genuineintel" in v: info["vendor"] = "intel" elif "authenticamd" in v: info["vendor"] = "amd" # ARM sometimes has 'model name' or 'Hardware' if info["vendor"] == "unknown": if any(tok in low for tok in ["aarch64", "armv8", "arm cortex", "kunpeng", "kirin", "huawei"]): info["vendor"] = "arm" # flags collection (x86 uses 'flags', arm uses 'Features') flags = set() for key in ("flags", "Features", "features"): m = re.search(rf"^{key}\s*:\s*(.+)$", cpuinfo, re.IGNORECASE | re.MULTILINE) if m: flags.update(m.group(1).lower().split()) info["raw"]["flags"] = flags # feature summary if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl", "avx512vnni"]): info["features"].add("AVX512") if "avx2" in flags or "avx2" in low: info["features"].add("AVX2") # AMX flags on Linux are with underscores; keep hyphen fallback just in case if any( f in flags or f in low for f in ["amx_bf16", "amx_int8", "amx_tile", "amx-bf16", "amx-int8", "amx-tile"] ): info["features"].add("AMX") elif sysname == "Darwin": # macOS: Apple Silicon (arm64) vs Intel arch = platform.machine().lower() info["arch"] = arch if arch in ("arm64", "aarch64"): info["vendor"] = "arm" else: info["vendor"] = "intel" # No AVX/AMX on Apple Silicon; assume none elif sysname == "Windows": # Minimal detection via arch; detailed CPUID omitted for brevity arch = platform.machine().lower() info["arch"] = arch if arch in ("arm64", "aarch64"): info["vendor"] = "arm" else: # Could be Intel or AMD; leave unknown info["vendor"] = "unknown" except Exception as e: print(f"Warning: CPU detection failed: {e}") return info def build_extension(self, ext: CMakeExtension): """ Main entry point for building the extension. Checks if multi-variant build is requested (CPUINFER_BUILD_ALL_VARIANTS=1) and routes to the appropriate build method. """ if _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False): # Build all 3 variants (AMX, AVX512, AVX2) self.build_multi_variants(ext) else: # Build single variant (original behavior) self._build_single_variant(ext) def build_multi_variants(self, ext: CMakeExtension): """ Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel. This method is called when CPUINFER_BUILD_ALL_VARIANTS=1 is set. It builds three separate extensions with different CPU instruction sets and renames the output .so files with variant suffixes. """ print("=" * 80) print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)") print("=" * 80) # Define the 3 variants to build variants = [ { 'name': 'amx', 'env': { 'CPUINFER_CPU_INSTRUCT': 'NATIVE', 'CPUINFER_ENABLE_AMX': 'ON', }, 'description': 'AMX variant (Intel Sapphire Rapids+)' }, { 'name': 'avx512', 'env': { 'CPUINFER_CPU_INSTRUCT': 'AVX512', 'CPUINFER_ENABLE_AMX': 'OFF', }, 'description': 'AVX512 variant (Intel Skylake-X/Ice Lake/Cascade Lake)' }, { 'name': 'avx2', 'env': { 'CPUINFER_CPU_INSTRUCT': 'AVX2', 'CPUINFER_ENABLE_AMX': 'OFF', }, 'description': 'AVX2 variant (maximum compatibility)' } ] # Save original environment original_env = os.environ.copy() extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve() for i, variant in enumerate(variants, 1): print(f"\n{'=' * 80}") print(f"Building variant {i}/3: {variant['description']}") print(f"{'=' * 80}\n") # Set variant-specific environment variables os.environ.update(variant['env']) # Use a unique build directory for this variant original_build_temp = self.build_temp self.build_temp = str(Path(self.build_temp) / f"variant_{variant['name']}") try: # Build this variant (calls the single-variant build logic) self._build_single_variant(ext) # Rename the generated .so file to include variant suffix # Original: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so # Renamed: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so # Extract the base extension name (without package prefix) # ext.name is "kt_kernel.kt_kernel_ext", we want "kt_kernel_ext" base_ext_name = ext.name.split('.')[-1] # Find the newly built .so file import time time.sleep(0.5) # Give filesystem time to sync built_candidates = [ f for f in Path(extdir).glob("*.so") if f.name.startswith(base_ext_name) and not f.name.startswith(f"_{base_ext_name}_") ] if not built_candidates: print(f"WARNING: No .so file found for {base_ext_name} in {extdir}") print(f"Files in {extdir}:") for f in Path(extdir).glob("*.so"): print(f" {f.name}") for so_file in built_candidates: # Extract the python tag part (e.g., ".cpython-311-x86_64-linux-gnu.so") suffix = so_file.name.replace(base_ext_name, "") new_name = f"_{base_ext_name}_{variant['name']}{suffix}" new_path = extdir / new_name print(f"-- Renaming {so_file.name} -> {new_name}") if new_path.exists(): print(f" WARNING: Target file already exists, removing: {new_path}") new_path.unlink() so_file.rename(new_path) print(f" ✓ Successfully renamed to {new_name}") finally: # Restore build_temp for next iteration self.build_temp = original_build_temp # Restore original environment os.environ.clear() os.environ.update(original_env) print(f"\n{'=' * 80}") print("✓ Successfully built all 3 CPU variants") print(f"{'=' * 80}\n") def _build_single_variant(self, ext: CMakeExtension): """ Build a single CPU variant. This contains the core build logic extracted from the original build_extension method. """ # Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA def detect_cuda_toolkit() -> bool: # Respect CUDA_HOME cuda_home = os.environ.get("CUDA_HOME") if cuda_home: nvcc_path = Path(cuda_home) / "bin" / "nvcc" if nvcc_path.exists(): return True # PATH lookup if shutil.which("nvcc") is not None: return True # Common default install prefix if Path("/usr/local/cuda/bin/nvcc").exists(): return True return False # Locate nvcc executable (without forcing user to set -DCMAKE_CUDA_COMPILER) def find_nvcc_path() -> str | None: cuda_home = os.environ.get("CUDA_HOME") if cuda_home: cand = Path(cuda_home) / "bin" / "nvcc" if cand.exists(): return str(cand) which_nvcc = shutil.which("nvcc") if which_nvcc: return which_nvcc # Common fallbacks (ordered by preference) for cand in [ "/usr/local/cuda-12.6/bin/nvcc", "/usr/local/cuda/bin/nvcc", "/usr/bin/nvcc", "/usr/lib/nvidia-cuda-toolkit/bin/nvcc", ]: if Path(cand).exists(): return cand return None # Note: We no longer set CMAKE_CUDA_ARCHITECTURES by default. # If users want to specify CUDA archs, they can set env CPUINFER_CUDA_ARCHS # (e.g. "89" or "86;89") or pass it via CMAKE_ARGS. auto_moe_kernel_ = False # Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values cuda_env = _env_get_bool("CPUINFER_USE_CUDA", None) if cuda_env is None: auto_cuda = detect_cuda_toolkit() os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0" print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}") elif cuda_env: print("-- CPUINFER_USE_CUDA explicitly enabled") else: print("-- CPUINFER_USE_CUDA explicitly disabled") extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve() cfg = default_build_type() build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}" build_temp.mkdir(parents=True, exist_ok=True) # Base CMake args cmake_args = [ f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/", f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={cfg}", ] # CPU feature flags mapping: if user specified CPUINFER_CPU_INSTRUCT, honor it; # else auto-pick based on detection (x86 only) cmake_args += cpu_feature_flags() d = self.detect_cpu_info() print(f"Detected CPU info: {d}") cpu_mode = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper() # Vendor / feature specific toggles # AMD MoE: explicit env overrides; otherwise default ON on AMD CPU _forward_bool_env(cmake_args, "CPUINFER_ENABLE_BLIS", "KTRANSFORMERS_CPU_MOE_AMD") # if d.get("vendor") == "amd": # auto_moe_kernel_ = True # cmake_args.append("-DKTRANSFORMERS_CPU_MOE_AMD=ON") # print("-- Detected AMD CPU; enabling AMD MoE kernel (-DKTRANSFORMERS_CPU_MOE_AMD=ON)") # _forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT") # KML: explicit env overrides; otherwise default ON on ARM _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML") # if d.get("vendor") == "arm": # auto_moe_kernel_ = True # cmake_args.append("-DKTRANSFORMERS_CPU_USE_KML=ON") # print("-- Detected ARM CPU; enabling KML (-DKTRANSFORMERS_CPU_USE_KML=ON)") # AMX: explicit env overrides; else enable if detected if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"): if "AMX" in d["features"]: cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON") print("-- AMX support detected; enabling (-DKTRANSFORMERS_CPU_USE_AMX=ON)") # AVX512 umbrella (AMX/AVX512 kernels): # - If user explicitly sets CPUINFER_ENABLE_AVX512 -> honor it # - Otherwise, only auto-enable when CPU mode actually wants AVX512 # (NATIVE/FANCY/AVX512). In AVX2 mode we do NOT enable this, so # RAWINT4 / K2 kernels are not compiled. if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"): if cpu_mode in ("NATIVE", "FANCY", "AVX512") and ("AMX" in d["features"] or "AVX512" in d["features"]): cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON") print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)") else: print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella") # Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend # (Do not enable purely on vendor auto-detection to avoid surprise behavior.) amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None) kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None) if amd_env or kml_env: auto_moe_kernel_ = True already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args) if not already_set and auto_moe_kernel_: cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON") print( "-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_BLIS or CPUINFER_ENABLE_KML is ON" ) # Friendly summary print( f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={sorted(list(d.get('features', [])))}" ) # MLA toggle (string/boolean allowed) if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"): _forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA") # LTO toggles _forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO") _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS") _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE") # GPU backends (mutually exclusive expected) if _env_get_bool("CPUINFER_USE_CUDA", False): cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON") print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)") # Inject nvcc compiler path automatically unless user already specified one. user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args) if not user_specified_compiler: extra_env = os.environ.get("CMAKE_ARGS", "") if "CMAKE_CUDA_COMPILER" in extra_env: user_specified_compiler = True if not user_specified_compiler: nvcc_path = find_nvcc_path() if nvcc_path: cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}") print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)") else: print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.") # Optional host compiler for nvcc if user set CUDAHOSTCXX if os.environ.get("CUDAHOSTCXX"): hostcxx = os.environ["CUDAHOSTCXX"] cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}") print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}") # Respect user-provided architectures only (no default auto-detection). archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "").strip() if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args): cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}") print(f"-- Set CUDA architectures from CPUINFER_CUDA_ARCHS: {archs_env}") if _env_get_bool("CPUINFER_USE_ROCM", False): cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON") if _env_get_bool("CPUINFER_USE_MUSA", False): cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON") # Respect user extra CMAKE_ARGS (space separated) extra = os.environ.get("CMAKE_ARGS") if extra: cmake_args += [a for a in extra.split() if a] # Force rebuild? (delete cache) if _env_get_bool("CPUINFER_FORCE_REBUILD", True): cache = build_temp / "CMakeCache.txt" if cache.exists(): cache.unlink() print("-- CMake configure args:") for a in cmake_args: print(" ", a) # Configure subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True) # Build build_args = ["--build", ".", "--config", cfg] jobs = detect_parallel_jobs() if jobs: build_args += ["--parallel", jobs] print("-- CMake build args:", " ".join(build_args)) subprocess.run(["cmake", *build_args], cwd=build_temp, check=True) # On some systems LTO + CMake + pybind may place the built .so inside build tree; move if needed built_candidates = list(build_temp.rglob(f"{ext.name}*.so")) for cand in built_candidates: if cand.parent != extdir: target = extdir / cand.name target.parent.mkdir(parents=True, exist_ok=True) # Overwrite stale if not target.exists() or target.stat().st_mtime < cand.stat().st_mtime: print(f"-- Copying {cand} -> {target}") target.write_bytes(cand.read_bytes()) ################################################################################ # Version (simple). If you later add a python package dir, you can read from it. ################################################################################ # Import version from shared version.py at project root _version_file = Path(__file__).resolve().parent.parent / "version.py" if _version_file.exists(): _version_ns = {} with open(_version_file, "r", encoding="utf-8") as f: exec(f.read(), _version_ns) VERSION = os.environ.get("CPUINFER_VERSION", _version_ns.get("__version__", "0.4.2")) else: VERSION = os.environ.get("CPUINFER_VERSION", "0.4.2") ################################################################################ # Setup ################################################################################ setup( name="kt-kernel", version=VERSION, description="KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)", author="kvcache-ai", license="Apache-2.0", python_requires=">=3.8", packages=["kt_kernel", "kt_kernel.utils"], package_dir={ "kt_kernel": "python", "kt_kernel.utils": "python/utils", }, ext_modules=[CMakeExtension("kt_kernel.kt_kernel_ext", str(REPO_ROOT))], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: C++", "Operating System :: POSIX :: Linux", "Operating System :: MacOS", ], )