From d4831473073935305c7bdb0f105a4f92c0350812 Mon Sep 17 00:00:00 2001
From: Jiaqi Liao <30439460+SkqLiao@users.noreply.github.com>
Date: Tue, 11 Nov 2025 19:30:27 +0800
Subject: [PATCH] Fix kt-kernel compile issue (#1595)

* update install.sh

* fix import issue

* update README
---
 kt-kernel/README.md                | 181 +++++++++++++---------
 kt-kernel/install.sh               | 240 ++++++++++++++++++++++++++---
 kt-kernel/pyproject.toml           |   7 +-
 kt-kernel/python/utils/__init__.py |  16 ++
 kt-kernel/setup.py                 |   7 +-
 5 files changed, 357 insertions(+), 94 deletions(-)
 create mode 100644 kt-kernel/python/utils/__init__.py

diff --git a/kt-kernel/README.md b/kt-kernel/README.md
index 8cacb33..3eb3c7d 100644
--- a/kt-kernel/README.md
+++ b/kt-kernel/README.md
@@ -2,6 +2,13 @@
 
 High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, and KML support.
 
+## Note
+
+**Current Support Status:**
+- ✅ **Intel CPUs with AMX**: Fully supported
+- ⚠️ **LLAMAFILE backend**: In preview, not yet fully complete
+- ⚠️ **AMD CPUs with BLIS**: Upcoming, not yet fully integrated
+
 ## Features
 
 - **AMX Optimization**: Intel AMX (Advanced Matrix Extensions) support for INT4/INT8 quantized MoE inference
@@ -11,7 +18,7 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
 - **Async Execution**: Non-blocking `submit_forward` / `sync_forward` API for improved pipelining
 - **Easy Integration**: Clean Python API with automatic backend selection
 
-**Note**: LLAMAFILE backend support is currently in *preview* and not yet fully complete.
+**Note**: LLAMAFILE backend support is currently in *preview* and not yet fully complete.
 
 ## Installation
 
@@ -22,60 +29,40 @@ First, initialize git submodules:
 git submodule update --init --recursive
 ```
 
-### Standard Installation
+### Quick Installation (Recommended)
+
+The installation script automatically detects your CPU and configures optimal build settings:
+
 ```bash
-pip install .
+# Simple one-command installation (auto-detects CPU)
+./install.sh
 ```
 
-All dependencies (torch, safetensors, compressed-tensors, numpy) will be automatically installed from `pyproject.toml`.
+The installation script will:
+- Auto-detect CPU capabilities (AMX support)
+- Install `cmake` via conda (for the latest version)
+- Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
+
+**What gets configured automatically:**
+- AMX CPU detected → `NATIVE + AMX=ON`
+- No AMX detected → `NATIVE + AMX=OFF`
+
+⚠️ **Important for LLAMAFILE backend users:** If you have an AMX-capable CPU and plan to use the LLAMAFILE backend, do NOT use auto-detection. Use manual mode with `AVX512` or `AVX2` instead of `NATIVE` to avoid compilation issues (see below).
+
+### Manual Configuration (Advanced)
+
+If you need specific build options (e.g., for LLAMAFILE backend, compatibility, or binary distribution):
 
-### Editable Installation (Development)
 ```bash
-pip install -e .
+# Example for LLAMAFILE backend on AMX CPU with AVX512
+export CPUINFER_CPU_INSTRUCT=AVX512  # Options: NATIVE, AVX512, AVX2
+export CPUINFER_ENABLE_AMX=OFF       # Options: ON, OFF
+
+# Run with manual mode
+./install.sh --manual
 ```
 
-### Optional: Pre-install Dependencies
-
-If you encounter network issues or prefer to install dependencies separately, you can optionally use:
-```bash
-pip install -r requirements.txt
-```
-
-**Note**: This step is **optional**. If your environment already has torch and other required packages, you can skip this and directly run `pip install .`
-
-### Error Troubleshooting
-
-#### CUDA Not Found
-
-```
- -- Looking for a CUDA compiler - NOTFOUND
-  CMake Error at CMakeLists.txt:389 (message):
-    KTRANSFORMERS_USE_CUDA=ON but CUDA compiler not found
-```
-
-Make sure you have the CUDA toolkit installed and `nvcc` is in your system PATH.
-
-Try `export CMAKE_ARGS="-D CMAKE_CUDA_COMPILER=$(which nvcc)"` and run `pip install .` again.
-
-#### hwloc Not Found
-
-```
-  -- Could NOT find PkgConfig (missing: PKG_CONFIG_EXECUTABLE)
-  CMake Error at CMakeLists.txt:531 (message):
-    FindHWLOC needs pkg-config program and PKG_CONFIG_PATH must contain the
-    path to hwloc.pc file.
-```
-
-Run `sudo apt install libhwloc-dev` if on a Debian-based system or build from source: https://www.open-mpi.org/projects/hwloc/.
-
-```
-wget https://download.open-mpi.org/release/hwloc/v2.12/hwloc-2.12.2.tar.gz
-tar -xzf hwloc-2.12.2.tar.gz
-cd hwloc-2.12.2
-./configure
-make
-sudo make install
-```
+For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
 
 ## Verification
 
@@ -150,36 +137,92 @@ KTMoEWrapper.clear_buffer_cache()
 
 ## Build Configuration
 
-### CPU Instruction Set Tuning
-```bash
-export CPUINFER_CPU_INSTRUCT=FANCY   # Options: NATIVE|FANCY|AVX512|AVX2
-pip install .
-```
+### Manual Installation
 
-### AMX Configuration
-```bash
-export CPUINFER_ENABLE_AMX=ON        # Enable/disable AMX support
-pip install .
-```
+If you prefer manual installation without the `install.sh` script, follow these steps:
 
-### Build Type
-```bash
-export CPUINFER_BUILD_TYPE=Release   # Debug|RelWithDebInfo|Release
-pip install .
-```
+#### 1. Install System Dependencies
 
-### Parallel Build
-```bash
-export CPUINFER_PARALLEL=8           # Number of parallel jobs
-pip install .
-```
+**Prerequisites:**
+- `cmake` (recommended: `conda install -y cmake`)
+- `libhwloc-dev` and `pkg-config`
+
+#### 2. Set Build Configuration
+
+**Core Options:**
+
+| Variable | Options | Description |
+|----------|---------|-------------|
+| `CPUINFER_CPU_INSTRUCT` | `NATIVE`, `AVX512`, `AVX2`, `FANCY` | CPU instruction set to use |
+| `CPUINFER_ENABLE_AMX` | `ON`, `OFF` | Enable Intel AMX support |
+| `CPUINFER_BUILD_TYPE` | `Release`, `Debug`, `RelWithDebInfo` | Build type (default: `Release`) |
+| `CPUINFER_PARALLEL` | Number | Parallel build jobs (default: auto-detect) |
+| `CPUINFER_VERBOSE` | `0`, `1` | Verbose build output (default: `0`) |
+
+**Instruction Set Details:**
+
+- **`NATIVE`**: Auto-detect and use all available CPU instructions (`-march=native`) - **Recommended for best performance**
+- **`AVX512`**: Explicit AVX512 support for Skylake-SP and Cascade Lake
+- **`AVX2`**: AVX2 support for maximum compatibility
+- **`FANCY`**: AVX512 with full extensions (AVX512F/BW/DQ/VL/VNNI) for Ice Lake+ and Zen 4+. Use this when building pre-compiled binaries to distribute to users with modern CPUs. For local builds, prefer `NATIVE` for better performance.
+
+**Example Configurations:**
 
-### Verbose Build
 ```bash
+# Maximum performance on AMX CPU
+export CPUINFER_CPU_INSTRUCT=NATIVE
+export CPUINFER_ENABLE_AMX=ON
+
+# AVX512 CPU without AMX
+export CPUINFER_CPU_INSTRUCT=AVX512
+export CPUINFER_ENABLE_AMX=OFF
+
+# Compatibility build
+export CPUINFER_CPU_INSTRUCT=AVX2
+export CPUINFER_ENABLE_AMX=OFF
+
+# Debug build for development
+export CPUINFER_BUILD_TYPE=Debug
 export CPUINFER_VERBOSE=1
+```
+
+#### 3. Build and Install
+
+```bash
+# Editable installation (for development)
+pip install -e .
+
+# Standard installation
 pip install .
 ```
 
+## Error Troubleshooting
+
+### CUDA Not Found
+
+```
+ -- Looking for a CUDA compiler - NOTFOUND
+  CMake Error at CMakeLists.txt:389 (message):
+    KTRANSFORMERS_USE_CUDA=ON but CUDA compiler not found
+```
+
+Make sure you have the CUDA toolkit installed and `nvcc` is in your system PATH.
+
+Try `export CMAKE_ARGS="-D CMAKE_CUDA_COMPILER=$(which nvcc)"` and then run the installation again.
+
+### hwloc Not Found
+
+Run `sudo apt install libhwloc-dev` if on a Debian-based system or build from source: https://www.open-mpi.org/projects/hwloc/.
+
+```
+wget https://download.open-mpi.org/release/hwloc/v2.12/hwloc-2.12.2.tar.gz
+tar -xzf hwloc-2.12.2.tar.gz
+cd hwloc-2.12.2
+./configure
+make
+sudo make install
+```
+
 ## Weight Quantization
 
 KT-Kernel provides weight quantization tools for CPU-GPU hybrid inference (e.g., integrating with SGLang). Both tools work together to enable heterogeneous expert placement across CPUs and GPUs.
diff --git a/kt-kernel/install.sh b/kt-kernel/install.sh
index aba5a90..297aa55 100755
--- a/kt-kernel/install.sh
+++ b/kt-kernel/install.sh
@@ -1,42 +1,240 @@
 #!/usr/bin/env bash
 set -e
 
+install_dependencies() {
+  # Install build prerequisites: cmake (conda) and hwloc + pkg-config (OS package manager).
+  echo "Checking and installing system dependencies..."
+  # Prefix package-manager commands with sudo unless already running as root.
+  local SUDO="" OS=""
+  if [ "$EUID" -ne 0 ]; then
+    if command -v sudo &> /dev/null; then
+      SUDO="sudo"
+    else
+      echo "Warning: Not running as root and sudo not found. Package installation may fail."
+      echo "Please run as root or install sudo."
+    fi
+  fi
+
+  if command -v conda &> /dev/null; then
+    echo "Installing cmake via conda..."
+    conda install -y cmake
+  else
+    echo "Warning: conda not found. Skipping cmake installation via conda."
+    echo "Please install conda or manually install cmake."
+  fi
+
+  # Detect the distribution: prefer the ID field of /etc/os-release, then legacy marker files.
+  if [ -f /etc/os-release ]; then
+    . /etc/os-release
+    OS=$ID
+  elif [ -f /etc/debian_version ]; then
+    OS="debian"
+  elif [ -f /etc/redhat-release ]; then
+    OS="rhel"
+  else
+    echo "Warning: Unable to detect OS type. Skipping dependency installation."
+    return 0
+  fi
+
+  # Install hwloc headers and pkg-config with the distribution's package manager.
+  case "$OS" in
+    debian|ubuntu|linuxmint|pop)
+      echo "Detected Debian-based system. Installing libhwloc-dev and pkg-config..."
+      $SUDO apt-get update
+      $SUDO apt-get install -y libhwloc-dev pkg-config
+      ;;
+    fedora|rhel|centos|rocky|almalinux)
+      echo "Detected Red Hat-based system. Installing hwloc-devel and pkgconfig..."
+      $SUDO dnf install -y hwloc-devel pkgconfig || $SUDO yum install -y hwloc-devel pkgconfig
+      ;;
+    arch|manjaro)
+      echo "Detected Arch-based system. Installing hwloc and pkgconf..."
+      $SUDO pacman -S --needed --noconfirm hwloc pkgconf
+      ;;
+    opensuse*|sles)
+      echo "Detected openSUSE-based system. Installing hwloc-devel and pkg-config..."
+      $SUDO zypper install -y hwloc-devel pkg-config
+      ;;
+    *)
+      echo "Warning: Unsupported OS '$OS'. Please manually install libhwloc-dev and pkg-config."
+      ;;
+  esac
+}
+# Do not install anything when the user only asked for the help text.
+case "${1:-}" in -h|--help) ;; *) install_dependencies ;; esac
+
 usage() {
-  echo "Usage: $0 [avx|amx]"
+  cat <<EOF
+Usage: $0 [OPTIONS]
+
+This script builds kt-kernel with optimal settings for your CPU.
+
+OPTIONS:
+  (none)          Auto-detect CPU and configure automatically (recommended)
+  -h, --help      Show this help message
+  --manual        Skip auto-detection, use manual configuration (see below)
+
+AUTO-DETECTION (Default):
+  The script will automatically detect your CPU capabilities and configure:
+  - If AMX instructions detected → NATIVE + AMX=ON
+  - Otherwise                    → NATIVE + AMX=OFF
+
+MANUAL CONFIGURATION:
+  Use --manual flag and set these environment variables before running:
+
+  CPUINFER_CPU_INSTRUCT   - CPU instruction set
+                            Options: NATIVE, AVX512, AVX2
+  CPUINFER_ENABLE_AMX     - Enable Intel AMX support
+                            Options: ON, OFF
+
+Manual configuration examples:
+
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Configuration                    │ Use Case                             │
+├──────────────────────────────────┼──────────────────────────────────────┤
+│ NATIVE + AMX=ON                  │ Best performance on AMX CPUs         │
+│ AVX512 + AMX=OFF                 │ AVX512 CPUs without AMX              │
+│ AVX2 + AMX=OFF                   │ Older CPUs or maximum compatibility  │
+└──────────────────────────────────┴──────────────────────────────────────┘
+
+  Example manual build:
+    export CPUINFER_CPU_INSTRUCT=AVX512
+    export CPUINFER_ENABLE_AMX=OFF
+    $0 --manual
+
+Advanced option (for binary distribution):
+  FANCY - AVX512 with full extensions for Ice Lake+/Zen 4+
+          Use this when building pre-compiled binaries to distribute.
+
+Optional variables (with defaults):
+  CPUINFER_BUILD_TYPE=Release      Build type (Debug/RelWithDebInfo/Release)
+  CPUINFER_PARALLEL=8              Number of parallel build jobs
+  CPUINFER_VERBOSE=1               Verbose build output (0/1)
+
+EOF
   exit 1
 }
 
-if [ $# -ne 1 ]; then
+# Print "1" when /proc/cpuinfo lists an Intel AMX flag, otherwise print "0".
+detect_cpu_features() {
+  local cpuinfo=/proc/cpuinfo
+  local amx_found=0
+
+  # Hosts without /proc/cpuinfo (macOS, for example) are reported as
+  # non-AMX, so no platform-specific branch is required for them.
+  if [[ -f "$cpuinfo" ]]; then
+    # One matching flag (tile, int8 or bf16) is enough to report AMX.
+    if grep -qE 'amx_(tile|int8|bf16)' "$cpuinfo"; then
+      amx_found=1
+    fi
+  fi
+
+  printf '%s\n' "$amx_found"
+}
+
+# Check if user requested help
+if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
   usage
 fi
 
-MODE="$1"
-case "$MODE" in
-  avx)
-    export CPUINFER_CPU_INSTRUCT=AVX2
-    export CPUINFER_ENABLE_AMX=OFF
-    ;;
-  amx)
-    export CPUINFER_CPU_INSTRUCT=AMX512
+# Check if manual mode
+MANUAL_MODE=0
+if [ "$1" = "--manual" ]; then
+  MANUAL_MODE=1
+fi
+
+if [ "$MANUAL_MODE" = "0" ]; then
+  # Auto-detection mode
+  echo "=========================================="
+  echo "Auto-detecting CPU capabilities..."
+  echo "=========================================="
+  echo ""
+
+  HAS_AMX=$(detect_cpu_features)
+
+  if [ "$HAS_AMX" = "1" ]; then
+    echo "✓ AMX instructions detected"
+    export CPUINFER_CPU_INSTRUCT=NATIVE
     export CPUINFER_ENABLE_AMX=ON
-    ;;
-  *)
-    echo "Error: unknown mode '$MODE'"
+    echo "  Configuration: NATIVE + AMX=ON (best performance)"
+    echo ""
+    echo "  ⚠️  Note: If you plan to use LLAMAFILE backend, use manual mode:"
+    echo "     export CPUINFER_CPU_INSTRUCT=AVX512(AVX2/FANCY)"
+    echo "     export CPUINFER_ENABLE_AMX=OFF"
+    echo "     ./install.sh --manual"
+  else
+    echo "ℹ AMX instructions not detected"
+    export CPUINFER_CPU_INSTRUCT=NATIVE
+    export CPUINFER_ENABLE_AMX=OFF
+    echo "  Configuration: NATIVE + AMX=OFF"
+  fi
+
+  echo ""
+  echo "To use manual configuration instead, run: $0 --manual"
+  echo ""
+else
+  # Manual mode - validate user configuration (no exports)
+  if [ -z "$CPUINFER_CPU_INSTRUCT" ] || [ -z "$CPUINFER_ENABLE_AMX" ]; then
+    echo "Error: Manual mode requires CPUINFER_CPU_INSTRUCT and CPUINFER_ENABLE_AMX to be set."
+    echo ""
     usage
-    ;;
-esac
+  fi
 
-export CPUINFER_BUILD_TYPE=Release
-export CPUINFER_PARALLEL=8
-export CPUINFER_VERBOSE=1
+  # Validate CPUINFER_CPU_INSTRUCT
+  case "$CPUINFER_CPU_INSTRUCT" in
+    NATIVE|FANCY|AVX512|AVX2)
+      ;;
+    *)
+      echo "Error: Invalid CPUINFER_CPU_INSTRUCT='$CPUINFER_CPU_INSTRUCT'"
+      echo "Must be one of: NATIVE, FANCY, AVX512, AVX2"
+      exit 1
+      ;;
+  esac
 
-echo "Building in mode: $MODE"
-echo "Environment:"
+  # Validate CPUINFER_ENABLE_AMX
+  case "$CPUINFER_ENABLE_AMX" in
+    ON|OFF)
+      ;;
+    *)
+      echo "Error: Invalid CPUINFER_ENABLE_AMX='$CPUINFER_ENABLE_AMX'"
+      echo "Must be either: ON or OFF"
+      exit 1
+      ;;
+  esac
+
+  # Warn about problematic configuration
+  if [ "$CPUINFER_CPU_INSTRUCT" = "NATIVE" ] && [ "$CPUINFER_ENABLE_AMX" = "OFF" ]; then
+    HAS_AMX=$(detect_cpu_features)
+    if [ "$HAS_AMX" = "1" ]; then
+      echo "⚠️  WARNING: NATIVE + AMX=OFF on AMX-capable CPU may cause compilation issues!"
+      echo "   Recommended: Use AVX512 or AVX2 instead of NATIVE when AMX=OFF"
+      echo ""
+      read -p "Continue anyway? (y/N) " -n 1 -r
+      echo
+      if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+        exit 1
+      fi
+    fi
+  fi
+fi
+
+# Set defaults for optional variables
+export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
+export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
+export CPUINFER_VERBOSE=${CPUINFER_VERBOSE:-1}
+
+echo "Building kt-kernel with configuration:"
 echo "  CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
 echo "  CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
 echo "  CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
 echo "  CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
 echo "  CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
+echo ""
 
-pip install -e . -v
+pip install . -v
 
+# Reached only when pip succeeded: `set -e` at the top aborts the script on failure.
+echo "Successfully built and installed kt-kernel with configuration:"
+echo "  CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
+echo "  CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
+echo "  CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
diff --git a/kt-kernel/pyproject.toml b/kt-kernel/pyproject.toml
index 9c27f73..5004cdb 100644
--- a/kt-kernel/pyproject.toml
+++ b/kt-kernel/pyproject.toml
@@ -36,10 +36,13 @@ dependencies = [
 Homepage = "https://github.com/kvcache-ai"
 
 [tool.setuptools]
-# Enable Python package (kt_kernel) and compiled extension (kt_kernel_ext)
-packages = ["kt_kernel"]
+packages = ["kt_kernel", "kt_kernel.utils"]
 include-package-data = true
 
+[tool.setuptools.package-dir]
+kt_kernel = "python"
+"kt_kernel.utils" = "python/utils"
+
 [tool.setuptools.package-data]
 # (empty) placeholder if you later add resources
 
diff --git a/kt-kernel/python/utils/__init__.py b/kt-kernel/python/utils/__init__.py
new file mode 100644
index 0000000..f71809b
--- /dev/null
+++ b/kt-kernel/python/utils/__init__.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Utilities for kt_kernel package.
+"""
+
+from .amx import AMXMoEWrapper
+from .llamafile import LlamafileMoEWrapper
+from .loader import SafeTensorLoader, GGUFLoader
+
+__all__ = [
+    "AMXMoEWrapper",
+    "LlamafileMoEWrapper",
+    "SafeTensorLoader",
+    "GGUFLoader",
+]
diff --git a/kt-kernel/setup.py b/kt-kernel/setup.py
index be043b8..3860f35 100644
--- a/kt-kernel/setup.py
+++ b/kt-kernel/setup.py
@@ -335,8 +335,11 @@ setup(
     author="kvcache-ai",
     license="Apache-2.0",
     python_requires=">=3.8",
-    packages=["kt_kernel"],
-    package_dir={"kt_kernel": "python"},
+    packages=["kt_kernel", "kt_kernel.utils"],
+    package_dir={
+        "kt_kernel": "python",
+        "kt_kernel.utils": "python/utils",
+    },
     ext_modules=[CMakeExtension("kt_kernel_ext", str(REPO_ROOT))],
     cmdclass={"build_ext": CMakeBuild},
     zip_safe=False,