Files
ktransformers/kt-kernel/setup.py
2026-01-28 16:31:56 +08:00

762 lines
32 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Lightweight packaging script for building and distributing kt-kernel,
a high-performance kernel operations library for KTransformers.
pip install kt-kernel
>>> from kt_kernel import AMXMoEWrapper
This script drives your existing CMake build (root `CMakeLists.txt`) and
only needs a working C++ toolchain, CMake (>=3.16), and pybind11 (vendored
already in the repo).
Environment knobs (export before running pip install .):
CPUINFER_FORCE_REBUILD=1 Delete the cached CMakeCache.txt before configuring (default: on; set 0 to reuse it)
CPUINFER_BUILD_TYPE=Release Debug / RelWithDebInfo / Release
CPUINFER_PARALLEL=8 Parallel build jobs (auto = detected cores)
CPUINFER_CPU_INSTRUCT=FANCY One of: NATIVE|FANCY|AVX512|AVX2 (maps to CMake flags; default and fallback: NATIVE)
CPUINFER_ENABLE_AMX=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX
CPUINFER_ENABLE_MLA=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MLA
CPUINFER_ENABLE_BLIS=OFF ON/OFF -> -DKTRANSFORMERS_CPU_MOE_AMD
CPUINFER_ENABLE_KML=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_KML
CPUINFER_ENABLE_AVX512=OFF ON/OFF -> -DKTRANSFORMERS_CPU_USE_AMX_AVX512
CPUINFER_ENABLE_AVX512_VNNI=OFF ON/OFF -> -DLLAMA_AVX512_VNNI
CPUINFER_ENABLE_AVX512_BF16=OFF ON/OFF -> -DLLAMA_AVX512_BF16
CPUINFER_ENABLE_AVX512_VBMI=OFF ON/OFF -> -DLLAMA_AVX512_VBMI (required for FP8 MoE)
CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT
CPUINFER_ENABLE_LTO=ON ON/OFF -> -DCPUINFER_ENABLE_LTO (your added option)
CPUINFER_LTO_JOBS=8 Forward to -DCPUINFER_LTO_JOBS
CPUINFER_LTO_MODE=auto Forward to -DCPUINFER_LTO_MODE
CPUINFER_NATIVE=ON (override LLAMA_NATIVE)
GPU backends (if ever added later, keep placeholders):
CPUINFER_USE_CUDA=0/1 -DKTRANSFORMERS_USE_CUDA
CPUINFER_USE_ROCM=0/1 -DKTRANSFORMERS_USE_ROCM
CPUINFER_USE_MUSA=0/1 -DKTRANSFORMERS_USE_MUSA
Usage:
pip install .
Or build wheel:
python -m build (requires the `build` package to be installed)
Resulting wheel exposes a top-level package `kt_kernel` with AMXMoEWrapper and other kernel wrappers.
"""
from __future__ import annotations
import os
import re
import sys
import platform
import subprocess
from pathlib import Path
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext
import shutil
# -------------------------
# Env parsing helpers
# -------------------------
def _env_get_bool(name: str, default: bool | None = None) -> bool | None:
    """Parse environment variable ``name`` as a boolean.

    The value is stripped and lower-cased before matching. Returns True for
    1/on/true/yes/y/enable/enabled, False for 0/off/false/no/n/disable/disabled,
    and ``default`` when the variable is unset or holds any other text.
    """
    truthy = {"1", "on", "true", "yes", "y", "enable", "enabled"}
    falsy = {"0", "off", "false", "no", "n", "disable", "disabled"}

    raw = os.environ.get(name)
    token = None if raw is None else raw.strip().lower()
    if token in truthy:
        return True
    if token in falsy:
        return False
    # Unset or unrecognised: let the caller's default decide.
    return default
def _cmake_onoff(flag: bool) -> str:
    """Render a truthy/falsy value as the CMake literal ``"ON"`` or ``"OFF"``."""
    return ("OFF", "ON")[bool(flag)]
def _forward_bool_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
    """Forward a boolean environment variable to CMake.

    When ``env_name`` parses as a boolean, append ``-D{cmake_flag}=ON`` or
    ``-D{cmake_flag}=OFF`` to ``cmake_args`` (mutated in place), log the
    forwarding and return True. Return False, leaving ``cmake_args`` untouched,
    when the variable is unset or not a recognised boolean.
    """
    parsed = _env_get_bool(env_name, None)
    if parsed is not None:
        definition = f"-D{cmake_flag}={_cmake_onoff(parsed)}"
        cmake_args.append(definition)
        print(f"-- Forward {env_name} -> {definition}")
        return True
    return False
def _forward_str_env(cmake_args: list[str], env_name: str, cmake_flag: str) -> bool:
    """Forward a non-empty environment variable verbatim to CMake.

    Appends ``-D{cmake_flag}={value}`` to ``cmake_args`` (mutated in place) and
    returns True when ``env_name`` is set to a non-empty string; otherwise
    returns False without touching ``cmake_args``.
    """
    value = os.environ.get(env_name)
    if value:
        definition = f"-D{cmake_flag}={value}"
        cmake_args.append(definition)
        print(f"-- Forward {env_name} -> {definition}")
        return True
    return False
################################################################################
# Helpers
################################################################################
# Directory containing this setup.py; used as the CMake source directory.
REPO_ROOT = Path(__file__).parent.resolve()

# CPUINFER_CPU_INSTRUCT mode -> space-separated CMake definitions.
# Each non-NATIVE tier repeats the previous tier's flags and adds its own.
CPU_FEATURE_MAP = {
    "FANCY": (
        "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON "
        "-DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON "
        "-DLLAMA_AVX512_FANCY_SIMD=ON"
    ),
    "AVX512": (
        "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON "
        "-DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
    ),
    "AVX2": (
        "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON "
        "-DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
    ),
    "NATIVE": "-DLLAMA_NATIVE=ON",
}
def default_build_type() -> str:
    """Return the CMake build type: ``CPUINFER_BUILD_TYPE`` if set, else ``"Release"``."""
    try:
        return os.environ["CPUINFER_BUILD_TYPE"]
    except KeyError:
        return "Release"
def detect_parallel_jobs() -> str:
    """Return the number of parallel build jobs as a string.

    ``CPUINFER_PARALLEL`` is returned verbatim whenever it is present in the
    environment (even if empty). Otherwise the CPU count reported by
    ``multiprocessing`` is used, falling back to ``"1"`` if that fails.
    """
    override = os.environ.get("CPUINFER_PARALLEL")
    if override is not None:
        return override
    try:
        from multiprocessing import cpu_count

        return str(cpu_count())
    except Exception:
        # Best effort: a serial build is always a valid fallback.
        return "1"
def cpu_feature_flags() -> list[str]:
    """Return the CMake ``-D`` definitions for the requested instruction-set mode.

    The mode is read from ``CPUINFER_CPU_INSTRUCT`` (case-insensitive, default
    NATIVE); unknown modes fall back to the NATIVE entry of CPU_FEATURE_MAP.
    """
    requested = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper()
    if requested not in CPU_FEATURE_MAP:
        requested = "NATIVE"
    # str.split() with no separator never yields empty tokens.
    return CPU_FEATURE_MAP[requested].split()
################################################################################
# CMakeExtension + builder
################################################################################
class CMakeExtension(Extension):
    """A setuptools Extension with no sources, built by CMake instead.

    ``sourcedir`` is the directory handed to CMake as its source directory;
    it is stored as an absolute, resolved path string.
    """

    def __init__(self, name: str, sourcedir: str = ""):
        # No sources: compilation is delegated to the custom build_ext command.
        super().__init__(name, sources=[])
        resolved_dir = Path(sourcedir).resolve()
        self.sourcedir = str(resolved_dir)
class CMakeBuild(build_ext):
def run(self):
    """Verify that the ``cmake`` executable works, then run the normal build_ext flow.

    Raises:
        RuntimeError: if ``cmake --version`` cannot be run or exits non-zero.
    """
    probe = ["cmake", "--version"]
    try:
        # Output is captured only to keep the probe silent.
        subprocess.run(probe, check=True, capture_output=True)
    except Exception as err:  # pragma: no cover
        raise RuntimeError("CMake is required to build this project") from err
    else:
        super().run()
def detect_cpu_info(self) -> dict:
    """Detect CPU vendor, architecture and instruction-set features.

    Returns a dict shaped like:
        {
            'vendor': 'intel'|'amd'|'arm'|'unknown',
            'arch': platform.machine().lower(),
            'features': set(['AVX2','AVX512','AMX', ...]),
            'raw': { 'flags': set([...]) }
        }

    Features and raw flags are only populated on Linux (from /proc/cpuinfo);
    on macOS and Windows only vendor/arch are filled in. Any exception during
    detection is reported as a warning and whatever was gathered is returned.
    """
    info = {
        "vendor": "unknown",
        "arch": platform.machine().lower(),
        "features": set(),
        "raw": {"flags": set()},
    }
    try:
        system_name = platform.system()
        if system_name == "Linux":
            with open("/proc/cpuinfo", "r", encoding="utf-8", errors="ignore") as handle:
                text = handle.read()
            lowered = text.lower()

            # Vendor: x86 Linux exposes a vendor_id line.
            vendor_match = re.search(r"vendor_id\s*:\s*(\S+)", text) if "vendor_id" in lowered else None
            if vendor_match:
                vendor_id = vendor_match.group(1).lower()
                if "genuineintel" in vendor_id:
                    info["vendor"] = "intel"
                elif "authenticamd" in vendor_id:
                    info["vendor"] = "amd"

            # ARM has no vendor_id; look for well-known markers anywhere in the text.
            arm_markers = ["aarch64", "armv8", "arm cortex", "kunpeng", "kirin", "huawei"]
            if info["vendor"] == "unknown" and any(marker in lowered for marker in arm_markers):
                info["vendor"] = "arm"

            # Flag list: x86 uses 'flags', ARM uses 'Features'. First matching line only.
            flags = set()
            for key in ("flags", "Features", "features"):
                found = re.search(rf"^{key}\s*:\s*(.+)$", text, re.IGNORECASE | re.MULTILINE)
                if found:
                    flags.update(found.group(1).lower().split())
            info["raw"]["flags"] = flags

            def mentioned(names):
                # Loose match: exact flag token OR substring anywhere in cpuinfo.
                return any(name in flags or name in lowered for name in names)

            def flagged(names):
                # Strict match: exact flag token only.
                return any(name in flags for name in names)

            features = info["features"]
            if mentioned(["avx512f", "avx512bw", "avx512dq", "avx512vl"]):
                features.add("AVX512")
            if mentioned(["avx2"]):
                features.add("AVX2")
            # AMX flags on Linux use underscores; hyphenated spellings kept as a fallback.
            if mentioned(["amx_bf16", "amx_int8", "amx_tile", "amx-bf16", "amx-int8", "amx-tile"]):
                features.add("AMX")

            # Fine-grained AVX512 subsets (both spellings of each flag are accepted).
            avx512_subsets = (
                ("AVX512_VNNI", ["avx512_vnni", "avx512vnni"]),
                ("AVX512_BF16", ["avx512_bf16", "avx512bf16"]),
                ("AVX512_VBMI", ["avx512_vbmi", "avx512vbmi"]),
                ("AVX512_VPOPCNTDQ", ["avx512_vpopcntdq", "avx512vpopcntdq"]),
            )
            for label, aliases in avx512_subsets:
                if flagged(aliases):
                    features.add(label)
        elif system_name in ("Darwin", "Windows"):
            # No flag probing here: classify by machine architecture only.
            machine = platform.machine().lower()
            info["arch"] = machine
            if machine in ("arm64", "aarch64"):
                info["vendor"] = "arm"
            elif system_name == "Darwin":
                # Non-ARM macOS is treated as Intel.
                info["vendor"] = "intel"
            else:
                # Windows x86 could be Intel or AMD; leave unknown.
                info["vendor"] = "unknown"
    except Exception as exc:
        print(f"Warning: CPU detection failed: {exc}")
    return info
def build_extension(self, ext: CMakeExtension):
    """
    Main entry point for building the extension.

    Routes to build_multi_variants() (six CPU variants in one wheel) when
    CPUINFER_BUILD_ALL_VARIANTS is truthy, otherwise to the original
    single-variant build.
    """
    build_all = _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False)
    builder = self.build_multi_variants if build_all else self._build_single_variant
    builder(ext)
def build_multi_variants(self, ext: CMakeExtension):
    """
    Build all 6 CPU variants with progressive AVX512 capabilities.

    This creates 6 separate .so files optimized for different CPU generations:
    - _kt_kernel_ext_avx2.so (Haswell+, 2013)
    - _kt_kernel_ext_avx512_base.so (Skylake-X+, 2017)
    - _kt_kernel_ext_avx512_vnni.so (Cascade Lake+, 2019)
    - _kt_kernel_ext_avx512_vbmi.so (Ice Lake client, 2019)
    - _kt_kernel_ext_avx512_bf16.so (Ice Lake server/Zen 4+, 2021)
    - _kt_kernel_ext_amx.so (Sapphire Rapids+, 2023)

    Runtime CPU detection (in _cpu_detect.py) will automatically select the best match.

    Each variant is configured by temporarily overriding CPUINFER_* environment
    variables, built in its own CMake build directory, and its output renamed
    to ``_kt_kernel_ext_{variant}{abi-suffix}.so``. The caller's original
    environment values are restored even when a variant build raises.

    Args:
        ext: The CMakeExtension to build.

    Raises:
        subprocess.CalledProcessError: propagated from the per-variant CMake
            configure/build steps.
    """
    banner = "=" * 70
    print(banner)
    print("Building kt-kernel with ALL 6 CPU variants")
    print(banner)
    print()
    print("This will build six progressive variants in a single wheel:")
    print(" 1. AVX2 - Haswell+ (2013)")
    print(" 2. AVX512 Base - Skylake-X+ (2017)")
    print(" 3. AVX512+VNNI - Cascade Lake+ (2019)")
    print(" 4. AVX512+VBMI - Ice Lake client (2019)")
    print(" 5. AVX512+BF16 - Ice Lake server, Zen 4+ (2021)")
    print(" 6. AMX - Sapphire Rapids+ (2023)")
    print()
    print("Runtime CPU detection will automatically select the best variant.")
    print()

    extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
    cfg = default_build_type()
    # Last dotted component, e.g. "kt_kernel.kt_kernel_ext" -> "kt_kernel_ext".
    module_name = ext.name.split(".")[-1]

    # Variant configurations: (name, description, env_vars).
    # Every variant pins ALL managed knobs so that neither the caller's
    # environment nor a previously built variant can leak into it.
    variants = [
        (
            "avx2",
            "AVX2 baseline",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX2",
                "CPUINFER_ENABLE_AVX512": "OFF",
                "CPUINFER_ENABLE_AVX512_VNNI": "OFF",
                "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                "CPUINFER_ENABLE_AVX512_VBMI": "OFF",
                "CPUINFER_ENABLE_AMX": "OFF",
            },
        ),
        (
            "avx512_base",
            "AVX512F+BW",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX512",
                "CPUINFER_ENABLE_AVX512": "ON",
                "CPUINFER_ENABLE_AVX512_VNNI": "OFF",
                "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                "CPUINFER_ENABLE_AVX512_VBMI": "OFF",
                "CPUINFER_ENABLE_AMX": "OFF",
            },
        ),
        (
            "avx512_vnni",
            "AVX512F+VNNI",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX512",
                "CPUINFER_ENABLE_AVX512": "ON",
                "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                "CPUINFER_ENABLE_AVX512_VBMI": "OFF",
                "CPUINFER_ENABLE_AMX": "OFF",
            },
        ),
        (
            "avx512_vbmi",
            "AVX512F+VNNI+VBMI",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX512",
                "CPUINFER_ENABLE_AVX512": "ON",
                "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                "CPUINFER_ENABLE_AVX512_BF16": "OFF",
                "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                "CPUINFER_ENABLE_AMX": "OFF",
            },
        ),
        (
            "avx512_bf16",
            "AVX512 Full (F+VNNI+VBMI+BF16)",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX512",
                "CPUINFER_ENABLE_AVX512": "ON",
                "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                "CPUINFER_ENABLE_AVX512_BF16": "ON",
                "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                "CPUINFER_ENABLE_AMX": "OFF",
            },
        ),
        (
            "amx",
            "AMX + AVX512 Full",
            {
                "CPUINFER_CPU_INSTRUCT": "AVX512",
                "CPUINFER_ENABLE_AVX512": "ON",
                "CPUINFER_ENABLE_AVX512_VNNI": "ON",
                "CPUINFER_ENABLE_AVX512_BF16": "ON",
                "CPUINFER_ENABLE_AVX512_VBMI": "ON",
                "CPUINFER_ENABLE_AMX": "ON",
            },
        ),
    ]

    # Snapshot every env var that any variant overrides, so it can be restored.
    managed_keys = sorted({key for _, _, env_vars in variants for key in env_vars})
    env_backup = {key: os.environ.get(key) for key in managed_keys}

    missing = []
    try:
        for variant_name, variant_desc, env_vars in variants:
            print(banner)
            print(f"Building {variant_name.upper()} variant ({variant_desc})")
            print(banner)
            print()

            # Set environment variables for this variant
            for key, value in env_vars.items():
                os.environ[key] = value
                print(f" {key} = {value}")

            # Use a separate build directory for each variant
            build_temp = Path(self.build_temp) / f"{ext.name}_{cfg}_{variant_name}"
            build_temp.mkdir(parents=True, exist_ok=True)

            self._build_single_variant_impl(ext, extdir, build_temp, cfg)

            # Rename the built .so file to include the variant suffix
            # Original name: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
            # New name: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
            built_so_files = sorted(extdir.glob(f"{module_name}.*.so"))
            if not built_so_files:
                print(f"⚠ Warning: Could not find built .so file for {variant_name} variant")
                print()
                missing.append(variant_name)
                continue

            original_so = built_so_files[0]
            # Strip only the leading module name, keeping the ABI suffix:
            # "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so" -> ".cpython-311-x86_64-linux-gnu.so"
            suffix = original_so.name[len(module_name):]
            new_name = f"_kt_kernel_ext_{variant_name}{suffix}"
            # Path.replace overwrites a stale file from an earlier build.
            original_so.replace(extdir / new_name)
            print(f"✓ Built and renamed to: {new_name}")
            print()
    finally:
        # Restore original env vars, even if a variant build failed
        for key, value in env_backup.items():
            if value is not None:
                os.environ[key] = value
            else:
                os.environ.pop(key, None)

    print(banner)
    if missing:
        built_count = len(variants) - len(missing)
        print(f"⚠ Built {built_count} of {len(variants)} variants; missing: {', '.join(missing)}")
    else:
        print("✓ All 6 variants built successfully!")
    print(banner)
    print()
    variant_files = sorted(extdir.glob("_kt_kernel_ext_*.so"))
    print(f"The wheel now contains {len(variant_files)} CPU variants:")
    for so_file in variant_files:
        print(f" - {so_file.name}")
    print()
def _build_single_variant(self, ext: CMakeExtension):
    """Build one variant using the environment as-is (original behaviour).

    Computes the output directory and a per-extension, per-build-type CMake
    build directory, then delegates to _build_single_variant_impl().
    """
    cfg = default_build_type()
    extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
    build_temp = Path(self.build_temp, f"{ext.name}_{cfg}")
    build_temp.mkdir(parents=True, exist_ok=True)
    self._build_single_variant_impl(ext, extdir, build_temp, cfg)
def _build_single_variant_impl(self, ext: CMakeExtension, extdir: Path, build_temp: Path, cfg: str):
    """
    Core build logic for a single variant.

    Assembles the CMake configure arguments from the CPUINFER_* environment
    variables and the detected CPU features, then runs the CMake configure and
    build steps inside ``build_temp``. It's called by both
    _build_single_variant() and build_multi_variants().

    Args:
        ext: The CMakeExtension to build
        extdir: Directory where the .so file should be placed
        build_temp: Temporary build directory for CMake
        cfg: Build type (Release/Debug/etc.)

    Raises:
        subprocess.CalledProcessError: if the CMake configure or build step
            exits with a non-zero status.

    Side effects:
        When CPUINFER_USE_CUDA is unset (or not a recognised boolean) it is
        written to os.environ as "1"/"0" based on nvcc auto-detection.
    """

    def detect_cuda_toolkit() -> bool:
        """Return True if nvcc is found via CUDA_HOME, PATH or /usr/local/cuda."""
        cuda_home = os.environ.get("CUDA_HOME")
        if cuda_home and (Path(cuda_home) / "bin" / "nvcc").exists():
            return True
        if shutil.which("nvcc") is not None:
            return True
        # Common default install prefix
        return Path("/usr/local/cuda/bin/nvcc").exists()

    def find_nvcc_path() -> str | None:
        """Locate nvcc so users need not pass -DCMAKE_CUDA_COMPILER themselves."""
        cuda_home = os.environ.get("CUDA_HOME")
        if cuda_home:
            cand = Path(cuda_home) / "bin" / "nvcc"
            if cand.exists():
                return str(cand)
        which_nvcc = shutil.which("nvcc")
        if which_nvcc:
            return which_nvcc
        # Common fallbacks (ordered by preference)
        for fallback in (
            "/usr/local/cuda-12.6/bin/nvcc",
            "/usr/local/cuda/bin/nvcc",
            "/usr/bin/nvcc",
            "/usr/lib/nvidia-cuda-toolkit/bin/nvcc",
        ):
            if Path(fallback).exists():
                return fallback
        return None

    # Normalize CPUINFER_USE_CUDA: if unset, auto-detect; otherwise respect truthy/falsey values
    if _env_get_bool("CPUINFER_USE_CUDA", None) is None:
        auto_cuda = detect_cuda_toolkit()
        os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
        print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")

    # Base CMake args
    cmake_args = [
        f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}/",
        f"-DPYTHON_EXECUTABLE={sys.executable}",
        f"-DCMAKE_BUILD_TYPE={cfg}",
    ]

    # CPU feature flags for the CPUINFER_CPU_INSTRUCT mode (NATIVE when unset/unknown)
    cmake_args += cpu_feature_flags()
    d = self.detect_cpu_info()
    print(f"Detected CPU info: {d}")
    cpu_mode = os.environ.get("CPUINFER_CPU_INSTRUCT", "NATIVE").upper()
    # Modes in which AVX512-family features may be auto-enabled from detection.
    wants_avx512 = cpu_mode in ("NATIVE", "FANCY", "AVX512")

    # Vendor specific backends are opt-in via env only; they are intentionally
    # NOT auto-enabled from vendor detection (see the MOE kernel note below).
    # AMD MoE (BLIS) plus its optional install prefix:
    _forward_bool_env(cmake_args, "CPUINFER_ENABLE_BLIS", "KTRANSFORMERS_CPU_MOE_AMD")
    _forward_str_env(cmake_args, "CPUINFER_BLIS_ROOT", "BLIS_ROOT")
    # KML (ARM):
    _forward_bool_env(cmake_args, "CPUINFER_ENABLE_KML", "KTRANSFORMERS_CPU_USE_KML")

    # AMX: explicit env overrides; else enable if detected
    if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AMX", "KTRANSFORMERS_CPU_USE_AMX"):
        if "AMX" in d["features"]:
            cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX=ON")
            print("-- AMX support detected; enabling (-DKTRANSFORMERS_CPU_USE_AMX=ON)")

    # AVX512 umbrella (AMX/AVX512 kernels):
    # - If user explicitly sets CPUINFER_ENABLE_AVX512 -> honor it
    # - Otherwise, only auto-enable when CPU mode actually wants AVX512
    #   (NATIVE/FANCY/AVX512). In AVX2 mode we do NOT enable this, so
    #   RAWINT4 / K2 kernels are not compiled.
    if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512", "KTRANSFORMERS_CPU_USE_AMX_AVX512"):
        if wants_avx512 and ("AMX" in d["features"] or "AVX512" in d["features"]):
            cmake_args.append("-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON")
            print("-- Enabling AMX/AVX512 umbrella (-DKTRANSFORMERS_CPU_USE_AMX_AVX512=ON)")
        else:
            print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella")

    # Fine-grained AVX512 subset flags: explicit env wins; otherwise enable only
    # if the CPU reports the feature and the mode allows AVX512.
    avx512_extensions = (
        ("AVX512_VNNI", "CPUINFER_ENABLE_AVX512_VNNI", "LLAMA_AVX512_VNNI"),
        ("AVX512_BF16", "CPUINFER_ENABLE_AVX512_BF16", "LLAMA_AVX512_BF16"),
        ("AVX512_VBMI", "CPUINFER_ENABLE_AVX512_VBMI", "LLAMA_AVX512_VBMI"),
    )
    avx512_extension_enabled = False
    for feature, env_name, cmake_flag in avx512_extensions:
        explicit = _env_get_bool(env_name, None)
        if explicit is not None:
            _forward_bool_env(cmake_args, env_name, cmake_flag)
            # Only an explicit ON counts as "enabled"; an explicit OFF must not
            # drag base AVX512 in below.
            enabled = explicit
        elif wants_avx512 and feature in d["features"]:
            cmake_args.append(f"-D{cmake_flag}=ON")
            print(f"-- {feature} detected; enabling (-D{cmake_flag}=ON)")
            enabled = True
        else:
            enabled = False
        avx512_extension_enabled = avx512_extension_enabled or enabled

    # If any AVX512 extension is enabled, ensure base AVX512 is also enabled
    if avx512_extension_enabled and wants_avx512:
        if not any("LLAMA_AVX512=ON" in a for a in cmake_args):
            cmake_args.append("-DLLAMA_AVX512=ON")
            print("-- AVX512 extensions enabled; also enabling base AVX512F (-DLLAMA_AVX512=ON)")

    # Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend
    # (Do not enable purely on vendor auto-detection to avoid surprise behavior.)
    amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None)
    kml_env = _env_get_bool("CPUINFER_ENABLE_KML", None)
    auto_moe_kernel = bool(amd_env or kml_env)
    already_set = any("KTRANSFORMERS_CPU_MOE_KERNEL" in a for a in cmake_args)
    if auto_moe_kernel and not already_set:
        cmake_args.append("-DKTRANSFORMERS_CPU_MOE_KERNEL=ON")
        print(
            "-- Auto-enabling MOE kernel (-DKTRANSFORMERS_CPU_MOE_KERNEL=ON) because CPUINFER_ENABLE_BLIS or CPUINFER_ENABLE_KML is ON"
        )

    # Friendly summary
    detected_features = sorted(d.get("features", []))
    print(f"-- CPU detection: vendor={d.get('vendor')} arch={d.get('arch')} features={detected_features}")

    # MLA toggle (string/boolean allowed)
    if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA"):
        _forward_str_env(cmake_args, "CPUINFER_ENABLE_MLA", "KTRANSFORMERS_CPU_MLA")

    # LTO toggles
    _forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO")
    _forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS")
    _forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE")

    # CUDA static runtime toggle
    _forward_bool_env(cmake_args, "CPUINFER_CUDA_STATIC_RUNTIME", "KTRANSFORMERS_CUDA_STATIC_RUNTIME")

    # GPU backends (mutually exclusive expected)
    if _env_get_bool("CPUINFER_USE_CUDA", False):
        cmake_args.append("-DKTRANSFORMERS_USE_CUDA=ON")
        print("-- Enabling CUDA backend (-DKTRANSFORMERS_USE_CUDA=ON)")

        # Inject nvcc compiler path automatically unless the user already
        # specified one (in the args so far or via CMAKE_ARGS, appended later).
        user_specified_compiler = any("CMAKE_CUDA_COMPILER" in a for a in cmake_args) or (
            "CMAKE_CUDA_COMPILER" in os.environ.get("CMAKE_ARGS", "")
        )
        if not user_specified_compiler:
            nvcc_path = find_nvcc_path()
            if nvcc_path:
                cmake_args.append(f"-DCMAKE_CUDA_COMPILER={nvcc_path}")
                print(f"-- Auto-detected nvcc: {nvcc_path} (adding -DCMAKE_CUDA_COMPILER)")
            else:
                print("-- Warning: nvcc not found via CUDA_HOME/PATH/common prefixes; CUDA configure may fail.")

        # Optional host compiler for nvcc if user set CUDAHOSTCXX
        hostcxx = os.environ.get("CUDAHOSTCXX")
        if hostcxx:
            cmake_args.append(f"-DCMAKE_CUDA_HOST_COMPILER={hostcxx}")
            print(f"-- Using CUDA host compiler from CUDAHOSTCXX: {hostcxx}")

        # CUDA architectures: CPUINFER_CUDA_ARCHS (e.g. "89" or "86;89"),
        # defaulting to "80;86;89;90"; an empty value skips the flag entirely.
        archs_env = os.environ.get("CPUINFER_CUDA_ARCHS", "80;86;89;90").strip()
        if archs_env and not any("CMAKE_CUDA_ARCHITECTURES" in a for a in cmake_args):
            cmake_args.append(f"-DCMAKE_CUDA_ARCHITECTURES={archs_env}")
            print(f"-- Set CUDA architectures: {archs_env}")
    if _env_get_bool("CPUINFER_USE_ROCM", False):
        cmake_args.append("-DKTRANSFORMERS_USE_ROCM=ON")
    if _env_get_bool("CPUINFER_USE_MUSA", False):
        cmake_args.append("-DKTRANSFORMERS_USE_MUSA=ON")

    # Respect user extra CMAKE_ARGS (whitespace separated); appended last so
    # they come after every definition assembled above.
    extra = os.environ.get("CMAKE_ARGS")
    if extra:
        cmake_args += extra.split()

    # Force rebuild? (delete cache) -- on by default
    if _env_get_bool("CPUINFER_FORCE_REBUILD", True):
        cache = build_temp / "CMakeCache.txt"
        if cache.exists():
            cache.unlink()

    print("-- CMake configure args:")
    for a in cmake_args:
        print(" ", a)

    # Configure
    subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True)

    # Build
    build_args = ["--build", ".", "--config", cfg]
    jobs = detect_parallel_jobs()
    if jobs:
        build_args += ["--parallel", jobs]
    print("-- CMake build args:", " ".join(build_args))
    subprocess.run(["cmake", *build_args], cwd=build_temp, check=True)

    # On some systems LTO + CMake + pybind may place the built .so inside build tree; copy if needed.
    # The file on disk is named after the LAST dotted component of ext.name
    # (e.g. "kt_kernel_ext.cpython-311-x86_64-linux-gnu.so"), not the full dotted name.
    module_name = ext.name.split(".")[-1]
    for cand in build_temp.rglob(f"{module_name}*.so"):
        if cand.parent == extdir:
            continue
        target = extdir / cand.name
        target.parent.mkdir(parents=True, exist_ok=True)
        # Overwrite stale
        if not target.exists() or target.stat().st_mtime < cand.stat().st_mtime:
            print(f"-- Copying {cand} -> {target}")
            shutil.copyfile(cand, target)
################################################################################
# Version (simple). If you later add a python package dir, you can read from it.
################################################################################
# Base version: taken from __version__ in version.py one directory above this
# file's directory; "0.5.0" when the file or the attribute is missing.
_version_file = Path(__file__).resolve().parent.parent / "version.py"
_base_version = "0.5.0"
if _version_file.exists():
    _version_ns = {}
    with open(_version_file, "r", encoding="utf-8") as f:
        exec(f.read(), _version_ns)
    _base_version = _version_ns.get("__version__", "0.5.0")

# Final version: CPUINFER_VERSION (e.g., for testing) overrides the base version.
VERSION = os.environ.get("CPUINFER_VERSION")
if VERSION is not None:
    print(f"-- Explicit version: {VERSION}")
else:
    VERSION = _base_version
    print(f"-- Version: {VERSION}")

# Package name is always kt-kernel
# The CUDA-enabled wheel includes both CPU multi-variant support and CUDA capabilities
PACKAGE_NAME = "kt-kernel"
cuda_enabled = _env_get_bool("CPUINFER_USE_CUDA", False)
print(
    "-- Building kt-kernel with CUDA support (+ CPU multi-variant)"
    if cuda_enabled
    else "-- Building kt-kernel (CPU-only multi-variant)"
)
################################################################################
# Setup
################################################################################
# Importable package name -> source directory (relative to this setup.py).
# Insertion order doubles as the order of the ``packages`` list.
_package_dir = {
    "kt_kernel": "python",
    "kt_kernel.utils": "python/utils",
    "kt_kernel.cli": "python/cli",
    "kt_kernel.cli.commands": "python/cli/commands",
    "kt_kernel.cli.config": "python/cli/config",
    "kt_kernel.cli.utils": "python/cli/utils",
}

setup(
    name=PACKAGE_NAME,
    version=VERSION,
    description="KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)",
    author="kvcache-ai",
    license="Apache-2.0",
    python_requires=">=3.8",
    packages=list(_package_dir),
    package_dir=_package_dir,
    # Installs the `kt` command-line entry point.
    entry_points={"console_scripts": ["kt=kt_kernel.cli.main:main"]},
    # The native module is produced by CMake through the custom build_ext command.
    ext_modules=[CMakeExtension("kt_kernel.kt_kernel_ext", str(REPO_ROOT))],
    cmdclass={"build_ext": CMakeBuild},
    zip_safe=False,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Operating System :: POSIX :: Linux",
        "Operating System :: MacOS",
    ],
)