Fix CPU Instruction Set and Installation (#1729)

* [fix](kt-kernel): fix AVX512 cpu instruction set detection

* [feat](kt-kernel): AVX512 fallback kernel for RAW-INT4

* [fix](kt-kernel): fix setup version issue

* [fix](kt-kernel): update install for custom build

* [docs](kt-kernel): new installation guide for various cpu instruction set

* [fix](kt-kernel): fix _mm512_dpbusd_epi32_compat fallback implementation

* [style](kt-kernel): clang format
This commit is contained in:
Jiaqi Liao
2025-12-18 00:11:57 +08:00
committed by GitHub
parent a8667ddb58
commit 3c134359bc
11 changed files with 545 additions and 478 deletions

View File

@@ -28,7 +28,7 @@ option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF
# LTO control
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)
project(kt_kernel_ext VERSION 0.4.2)
project(kt_kernel_ext VERSION 0.4.4)
# Choose compilers BEFORE project() so CMake honors them
if(USE_CONDA_TOOLCHAIN)
if(NOT DEFINED ENV{CONDA_PREFIX} OR NOT EXISTS "$ENV{CONDA_PREFIX}")
@@ -378,7 +378,20 @@ if(HOST_IS_X86)
target_link_libraries(${test_name} llama OpenMP::OpenMP_CXX numa)
endforeach()
endif()
list(APPEND ARCH_FLAGS -mfma -mf16c -mavx512bf16 -mavx512vnni)
# Note: AVX512 subset flags (-mavx512vnni, -mavx512bf16) are already added
# in the generic x86 detection block above (lines 276-289) when corresponding
# LLAMA_AVX512_* options are enabled. No need to add them again here.
# -mfma is already added by LLAMA_NATIVE (line 254), LLAMA_AVX*, or LLAMA_FMA blocks.
# Only add -mf16c if LLAMA_F16C is not already enabled.
if(NOT LLAMA_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if(LLAMA_AVX512_VNNI)
message(STATUS "AVX512_VNNI enabled")
endif()
if(LLAMA_AVX512_BF16)
message(STATUS "AVX512_BF16 enabled")
endif()
endif()
endif()

View File

@@ -37,6 +37,7 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
-**Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
-**Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
-**AMD CPUs with BLIS**: Supported (for int8 prefill & decode)
-**Kimi-K2 Native INT4 (RAWINT4)**: Supported on AVX512 CPUs (CPU-GPU shared INT4 weights) - [Guide](../doc/en/Kimi-K2-Thinking-Native.md)
## Features
@@ -49,6 +50,8 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
### Option 1: Install from PyPI (Recommended for Most Users)
Coming soon...
Choose the version matching your CUDA installation:
```bash
@@ -104,76 +107,55 @@ python -c "import kt_kernel"
---
### Option 2: Install from Source (For AMD, ARM, or Custom Builds)
### Option 2: Install from Source (For Local Use or Custom Builds)
If you need AMD (BLIS), ARM (KML), or custom CUDA versions, build from source:
Build from source for local installation or when you need AMD (BLIS), ARM (KML), or custom CUDA versions.
#### Prerequisites
First, initialize git submodules:
First, initialize git submodules and create a conda environment:
```bash
git submodule update --init --recursive
```
#### Quick Installation
Step 0: Create and activate a conda environment (recommended):
```bash
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```
You can now install in two clear steps using the same script.
#### Quick Installation (Recommended)
**Option A: Two-step** (specify dependencies installation and build separately)
```bash
# 1) Install system prerequisites (cmake, hwloc, pkg-config)
./install.sh deps
# 2) Build and install kt-kernel (auto-detects CPU instruction set)
# By default, the script cleans the local ./build directory before compiling
./install.sh build
```
**Option B: One-step**
Simply run the install script - it will auto-detect your CPU and optimize for best performance:
```bash
./install.sh
```
The install script will:
- Auto-detect CPU capabilities (AMX support)
- Install `cmake` via conda (if available)
- Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
**What gets configured automatically:**
- AMX CPU detected → `NATIVE + AMX=ON`
- No AMX detected → `NATIVE + AMX=OFF`
⚠️ **Important for LLAMAFILE backend users:**
If you have an AMX-capable CPU but plan to use the LLAMAFILE backend, do NOT use the default auto-detection build.
Use "manual mode" with `CPUINFER_CPU_INSTRUCT` set to `AVX512` or `AVX2` instead of `NATIVE` to avoid compilation issues (see below).
⚠️ **Important for BLIS AMD backend users:**
for the installation guide, see this [issue](https://github.com/kvcache-ai/ktransformers/issues/1601)
### Manual Configuration (Advanced)
If you need specific build options (e.g., for LLAMAFILE backend, compatibility, or binary distribution):
**What happens automatically:**
- Auto-detects CPU capabilities (AMX, AVX512_VNNI, AVX512_BF16)
- Installs system dependencies (`cmake`, `libhwloc-dev`, `pkg-config`)
- Builds optimized binary for **your CPU only** (using `-march=native`)
- **Software fallbacks**: Automatically enabled for CPUs without VNNI/BF16
**Optional: Two-step installation**
```bash
# Example for LLAMAFILE backend on AMX CPU with AVX512
export CPUINFER_CPU_INSTRUCT=AVX512 # Options: NATIVE, AVX512, AVX2, FANCY
export CPUINFER_ENABLE_AMX=OFF # Options: ON, OFF
# Build only (skip auto-detection of instruction set)
./install.sh build --manual
./install.sh deps # Install dependencies only
./install.sh build # Build and install kt-kernel
```
For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
**CPU Requirements by Backend:**
| Backend | Minimum CPU Requirement | Example CPUs | Notes |
|---------|-------------------------|--------------|-------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+), AMD Zen+ | Universal compatibility |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+), Ice Lake, Cascade Lake | Software fallbacks for VNNI/BF16 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | Best performance, requires AMX hardware |
**Software Fallback Support (AVX512 backends):**
- ✅ VNNI fallback: Uses AVX512BW instructions
- ✅ BF16 fallback: Uses AVX512F instructions
- ✅ Older AVX512 CPUs (Skylake-X, Cascade Lake) can run RAWINT4 with fallbacks
⚠️ **Portability Note:** The default build is optimized for your specific CPU and may not work on different/older CPUs. For portable builds or binary distribution, see [Manual Configuration](#manual-configuration-advanced) below.
⚠️ **AMD BLIS backend users:** See [installation guide](https://github.com/kvcache-ai/ktransformers/issues/1601) for AMD-specific setup.
## Verification
@@ -482,11 +464,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
KTMoEWrapper.clear_buffer_cache()
```
### Manual Configuration (Advanced)
For portable builds, binary distribution, or cross-machine deployment, you need to manually specify target instruction sets:
```bash
# General distribution (works on any AVX512 CPU from 2017+)
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# Maximum compatibility (works on any CPU from 2013+)
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# Modern CPUs only (Ice Lake+, Zen 4+)
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```
**Optional: Override VNNI/BF16 detection**
```bash
# Force enable/disable VNNI and BF16 (for testing fallbacks)
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```
See `./install.sh --help` for all available options.
---
## Build Configuration
### Manual Installation
### Manual Installation (Without install.sh)
If you prefer manual installation without the `install.sh` script, follow these steps:
If you prefer manual installation without the `install.sh` script:
#### 1. Install System Dependencies
@@ -508,27 +523,29 @@ If you prefer manual installation without the `install.sh` script, follow these
**Instruction Set Details:**
- **`NATIVE`**: Auto-detect and use all available CPU instructions (`-march=native`) - **Recommended for best performance**
- **`AVX512`**: Explicit AVX512 support for Skylake-SP and Cascade Lake
- **`AVX2`**: AVX2 support for maximum compatibility
- **`FANCY`**: AVX512 with full extensions (AVX512F/BW/DQ/VL/VNNI) for Ice Lake+ and Zen 4+. Use this when building pre-compiled binaries to distribute to users with modern CPUs. For local builds, prefer `NATIVE` for better performance.
| Option | Target CPUs | Use Case |
|--------|-------------|----------|
| **`NATIVE`** | Your specific CPU only | Local builds (best performance, **default**) |
| **`AVX512`** | Skylake-X, Ice Lake, Cascade Lake, Zen 4+ | General distribution |
| **`AVX2`** | Haswell (2013) and newer | Maximum compatibility |
| **`FANCY`** | Ice Lake+, Zen 4+ | Modern CPUs with full AVX512 extensions |
**Example Configurations:**
```bash
# Maximum performance on AMX CPU
# Local use - maximum performance (default behavior)
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
export CPUINFER_ENABLE_AMX=ON # or OFF
# AVX512 CPU without AMX
# Distribution build - works on any AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
# Compatibility build
# Maximum compatibility - works on CPUs since 2013
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
# Debug build for development
# Debug build
export CPUINFER_BUILD_TYPE=Debug
export CPUINFER_VERBOSE=1
```

View File

@@ -38,6 +38,7 @@
-**带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重)
-**通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重)
-**带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode)
-**Kimi-K2 原生 INT4(RAWINT4)**:支持 AVX512 CPU(CPU-GPU 共享 INT4 权重)- [使用指南](../doc/en/Kimi-K2-Thinking-Native.md)
## 特性
@@ -49,69 +50,55 @@
## 安装
### 先决条件
### 从源码安装(本机使用或自定义构建)
首先初始化子模块:
适用于本地安装,或需要 AMD (BLIS)、ARM (KML) 或自定义 CUDA 版本的场景。
#### 先决条件
首先初始化子模块并创建 conda 环境:
```bash
git submodule update --init --recursive
```
### 快速安装(推荐)
第 0 步:创建并激活一个 conda 环境(推荐):
```bash
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```
随后可以用同一个脚本分两步或一步安装。
#### 快速安装(推荐)
方案 A(两步):可以指定依赖安装与编译构建
```bash
# 1) 安装系统依赖(cmake, hwloc, pkg-config)
./install.sh deps
# 2) 构建并安装 kt-kernel(自动检测 CPU 指令集)
# 默认会在编译前清理本地 ./build 目录
./install.sh build
```
方案 B(一步)
只需运行安装脚本,它会自动检测 CPU 并优化性能:
```bash
./install.sh
```
安装脚本会:
- 自动检测 CPU 能力(是否支持 AMX)
- 尝试通过 conda 安装 `cmake`(若可用)
- 根据你的操作系统安装系统依赖(`libhwloc-dev`、`pkg-config`)
**自动配置内容:**
- 检测到 AMX CPU → 使用 `NATIVE + AMX=ON`
- 未检测到 AMX → 使用 `NATIVE + AMX=OFF`
⚠️ **LLAMAFILE 后端用户特别说明:**
如果你有带 AMX 的 CPU但是计划使用 LLAMAFILE 后端,请不要使用默认的自动检测构建方式。
请使用“手动模式”,并将 `CPUINFER_CPU_INSTRUCT` 设为 `AVX512``AVX2` 而非 `NATIVE`,以避免编译期异常(见下文)。
### 手动配置(进阶)
如果你需要更精细的构建选项(例如为 LLAMAFILE 后端、兼容性或二进制分发配置):
**自动完成的操作:**
- 自动检测 CPU 能力(AMX、AVX512_VNNI、AVX512_BF16)
- 安装系统依赖(`cmake`、`libhwloc-dev`、`pkg-config`)
- 为**你的 CPU** 构建优化二进制(使用 `-march=native`)
- **软件回退机制**:为不支持 VNNI/BF16 的 CPU 自动启用
**可选:分步安装**
```bash
# 在带 AMX 的 CPU 上构建 LLAMAFILE 后端的示例(使用 AVX512)
export CPUINFER_CPU_INSTRUCT=AVX512 # 选项: NATIVE, AVX512, AVX2, FANCY
export CPUINFER_ENABLE_AMX=OFF # 选项: ON, OFF
# 仅构建(不进行指令集的自动检测)
./install.sh build --manual
./install.sh deps # 仅安装依赖
./install.sh build # 构建并安装 kt-kernel
```
更多构建选项和二进制分发配置,请参见 [构建配置](#构建配置) 一节。
如果遇到问题,可参考 [错误排查](#错误排查)。
**不同后端的 CPU 要求:**
| 后端 | 最低 CPU 要求 | 示例 CPU | 说明 |
|------|---------------|----------|------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+)、AMD Zen+ | 通用兼容性 |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+)、Ice Lake、Cascade Lake | 支持 VNNI/BF16 软件回退 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | 最佳性能,需要 AMX 硬件 |
**软件回退支持AVX512 后端):**
- ✅ VNNI 回退:使用 AVX512BW 指令
- ✅ BF16 回退:使用 AVX512F 指令
- ✅ 老的 AVX512 CPU(Skylake-X、Cascade Lake)可以运行 RAWINT4(使用回退)
⚠️ **可移植性说明:** 默认构建针对你的特定 CPU 优化,可能无法在不同/更老的 CPU 上运行。如需打包分发或跨机器部署,请参见下方的 [手动配置](#手动配置进阶)。
⚠️ **AMD BLIS 后端用户:** 请参见 [安装指南](https://github.com/kvcache-ai/ktransformers/issues/1601) 了解 AMD 专用配置。
## 验证安装
@@ -421,11 +408,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
KTMoEWrapper.clear_buffer_cache()
```
### 手动配置(进阶)
如需打包分发、跨机器部署或构建可移植二进制,需要手动指定目标指令集:
```bash
# 通用分发版(适用于 2017+ 的任何 AVX512 CPU)
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# 最大兼容性(适用于 2013+ 的任何 CPU)
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# 仅限现代 CPU(Ice Lake+、Zen 4+)
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```
**可选:覆盖 VNNI/BF16 检测**
```bash
# 强制启用/禁用 VNNI 和 BF16(用于测试回退)
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```
运行 `./install.sh --help` 查看所有可用选项。
---
## 构建配置
### 手动安装
### 手动安装(不使用 install.sh)
如果你不想使用 `install.sh`,可以按以下步骤手动构建
如果你不想使用 `install.sh` 脚本
#### 1. 安装系统依赖
@@ -447,24 +467,25 @@ KTMoEWrapper.clear_buffer_cache()
**指令集说明:**
- **`NATIVE`**:自动检测并启用所有可用 CPU 指令(`-march=native`)——**本机运行时首选**
- **`AVX512`**:为 Skylake-SP / Cascade Lake 显式开启 AVX512
- **`AVX2`**:开启 AVX2,兼容性较好
- **`FANCY`**:开启完整 AVX512 扩展(AVX512F/BW/DQ/VL/VNNI),适用于 Ice Lake+ 和 Zen 4+ 等较新平台。
用于向用户分发预编译二进制时推荐;本地构建推荐使用 `NATIVE` 以获得更优性能。
| 选项 | 目标 CPU | 使用场景 |
|------|----------|----------|
| **`NATIVE`** | 仅限你的特定 CPU | 本地构建(最佳性能,**默认**) |
| **`AVX512`** | Skylake-X、Ice Lake、Cascade Lake、Zen 4+ | 通用分发 |
| **`AVX2`** | Haswell (2013) 及更新 | 最大兼容性 |
| **`FANCY`** | Ice Lake+、Zen 4+ | 具有完整 AVX512 扩展的现代 CPU |
**配置示例:**
```bash
# 在 AMX CPU 上获得最高性能
# 本地使用 - 最高性能(默认行为)
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
export CPUINFER_ENABLE_AMX=ON # 或 OFF
# AVX512,无 AMX
# 分发构建 - 适用于任何 AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
# 兼容性优先构建
# 最大兼容性 - 适用于 2013 年以来的 CPU
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF

View File

@@ -229,8 +229,7 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
auto moe_cls = py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name);
moe_cls
.def(py::init<GeneralMOEConfig>())
moe_cls.def(py::init<GeneralMOEConfig>())
.def("warm_up_task", &MoeBindings::WarmUpBindings::cpuinfer_interface)
.def("load_weights_task",
py::overload_cast<std::shared_ptr<MoeClass>>(&MoeBindings::LoadWeightsBindings::cpuinfer_interface))
@@ -265,16 +264,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe,
args_->gpu_tp_count, args_->gpu_experts_num,
args_->w13_weight_ptrs, args_->w13_scale_ptrs,
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe, args_->gpu_tp_count,
args_->gpu_experts_num, args_->w13_weight_ptrs, args_->w13_scale_ptrs,
args_->w2_weight_ptrs, args_->w2_scale_ptrs);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe,
int gpu_tp_count, int gpu_experts_num,
py::list w13_weight_ptrs, py::list w13_scale_ptrs,
py::list w2_weight_ptrs, py::list w2_scale_ptrs) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe, int gpu_tp_count,
int gpu_experts_num, py::list w13_weight_ptrs,
py::list w13_scale_ptrs, py::list w2_weight_ptrs,
py::list w2_scale_ptrs) {
// Convert Python lists to std::vector<uintptr_t>
std::vector<uintptr_t> w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec;
@@ -283,16 +281,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
for (auto item : w2_weight_ptrs) w2_weight_vec.push_back(py::cast<uintptr_t>(item));
for (auto item : w2_scale_ptrs) w2_scale_vec.push_back(py::cast<uintptr_t>(item));
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
moe_cls.def("write_weight_scale_to_buffer_task", &WriteWeightScaleToBufferBindings::cpuinfer_interface,
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"),
py::arg("w13_weight_ptrs"), py::arg("w13_scale_ptrs"),
py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"), py::arg("w13_weight_ptrs"),
py::arg("w13_scale_ptrs"), py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
}
#endif
}

View File

@@ -19,41 +19,65 @@ BUILD_OPTIONS (for "build" or "all"):
--no-clean Do not delete local build/ before building (default cleans)
AUTO-DETECTION (Default):
The script will automatically detect your CPU capabilities and configure:
- If AMX instructions detected → NATIVE + AMX=ON
- Otherwise → NATIVE + AMX=OFF
The script will automatically detect your CPU and use ALL available features:
- CPUINFER_CPU_INSTRUCT = NATIVE (uses -march=native)
- CPUINFER_ENABLE_AMX = ON/OFF (based on detection)
- CPUINFER_ENABLE_AVX512_VNNI = ON/OFF (with fallback if OFF)
- CPUINFER_ENABLE_AVX512_BF16 = ON/OFF (with fallback if OFF)
✓ Best performance on YOUR machine
✗ Binary may NOT work on different/older CPUs
Use this when: Installing for local use only
MANUAL CONFIGURATION:
Use --manual flag and set these environment variables before running:
Use --manual flag when building for DISTRIBUTION or different machines.
Set these environment variables before running:
CPUINFER_CPU_INSTRUCT - CPU instruction set
Options: NATIVE, AVX512, AVX2, FANCY
CPUINFER_CPU_INSTRUCT - Target CPU instruction set
Options: AVX512, AVX2, FANCY, NATIVE
CPUINFER_ENABLE_AMX - Enable Intel AMX support
Options: ON, OFF
Manual configuration examples:
Distribution examples (portable binaries):
┌─────────────────────────────────────────────────────────────────────────┐
│ Configuration │ Use Case
├──────────────────────────────────┼──────────────────────────────────────┤
NATIVE + AMX=ON │ Best performance on AMX CPUs
AVX512 + AMX=OFF │ AVX512 CPUs without AMX
│ AVX2 + AMX=OFF │ Older CPUs or maximum compatibility │
└──────────────────────────────────┴──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────
│ Configuration │ Target CPUs │ Use Case │
├────────────────────────┼──────────────────────────┼──────────────────────┤
AVX512 + AMX=OFF │ Skylake-X, Ice Lake, │ General distribution
│ Cascade Lake, Zen 4 │ (recommended)
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ AVX2 + AMX=OFF │ Haswell (2013) and newer │ Maximum compatibility│
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ FANCY + AMX=OFF │ Ice Lake+, Zen 4+ │ Modern CPUs only │
│ │ (with full AVX512 ext) │ │
└────────────────────────┴──────────────────────────┴──────────────────────┘
Example manual build:
Use this when: Building Docker images, PyPI packages, or deploying to clusters
Example: Build for general distribution
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
$0 build --manual
# Result: Works on any CPU with AVX512 (2017+)
Advanced option (for binary distribution):
FANCY - AVX512 with full extensions for Ice Lake+/Zen 4+
Use this when building pre-compiled binaries to distribute.
Example: Build for maximum compatibility
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
$0 build --manual
# Result: Works on any CPU with AVX2 (2013+)
Optional variables (with defaults):
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
CPUINFER_PARALLEL=8 Number of parallel build jobs
CPUINFER_VERBOSE=1 Verbose build output (0/1)
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
CPUINFER_PARALLEL=8 Number of parallel build jobs
CPUINFER_VERBOSE=1 Verbose build output (0/1)
CPUINFER_ENABLE_AVX512_VNNI=ON/OFF Override VNNI detection (auto if unset)
CPUINFER_ENABLE_AVX512_BF16=ON/OFF Override BF16 detection (auto if unset)
Software Fallback Support:
✓ If VNNI not available: Uses AVX512BW fallback (2-3x slower but works)
✓ If BF16 not available: Uses AVX512F fallback (5-10x slower but works)
→ Old CPUs with only AVX512F+BW can run all code (slower but functional)
EOF
exit 1
@@ -120,20 +144,38 @@ install_dependencies() {
}
# Function to detect CPU features
# Returns: "has_amx has_avx512_vnni has_avx512_bf16" (space-separated 0/1 values)
detect_cpu_features() {
local has_amx=0
local has_avx512_vnni=0
local has_avx512_bf16=0
if [ -f /proc/cpuinfo ]; then
local cpu_flags
cpu_flags=$(grep -m1 "^flags" /proc/cpuinfo | tr ' ' '\n')
# Check for AMX support on Linux
if grep -q "amx_tile\|amx_int8\|amx_bf16" /proc/cpuinfo; then
if echo "$cpu_flags" | grep -qE "amx_tile|amx_int8|amx_bf16"; then
has_amx=1
fi
# Check for AVX512_VNNI support
if echo "$cpu_flags" | grep -qE "avx512_vnni|avx512vnni"; then
has_avx512_vnni=1
fi
# Check for AVX512_BF16 support
if echo "$cpu_flags" | grep -qE "avx512_bf16|avx512bf16"; then
has_avx512_bf16=1
fi
elif [ "$(uname)" = "Darwin" ]; then
# macOS doesn't have AMX (ARM or Intel without AMX)
has_amx=0
has_avx512_vnni=0
has_avx512_bf16=0
fi
echo "$has_amx"
echo "$has_amx $has_avx512_vnni $has_avx512_bf16"
}
build_step() {
@@ -161,34 +203,6 @@ build_step() {
echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
fi
# Check for multi-variant build mode (Docker environment)
if [ "${CPUINFER_BUILD_ALL_VARIANTS:-0}" = "1" ]; then
echo "=========================================="
echo "Building ALL CPU variants (AMX/AVX512/AVX2)"
echo "=========================================="
echo ""
echo "This will build three variants in a single wheel:"
echo " - AMX variant (Intel Sapphire Rapids+)"
echo " - AVX512 variant (Intel Skylake-X/Ice Lake+)"
echo " - AVX2 variant (maximum compatibility)"
echo ""
echo "Runtime CPU detection will automatically select the best variant."
echo ""
export CPUINFER_FORCE_REBUILD=1
export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
echo "Building with:"
echo " CPUINFER_BUILD_ALL_VARIANTS=1"
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
echo ""
pip install . -v
return 0
fi
if [ "$MANUAL_MODE" = "0" ]; then
# Auto-detection mode
echo "=========================================="
@@ -196,25 +210,70 @@ build_step() {
echo "=========================================="
echo ""
HAS_AMX=$(detect_cpu_features)
# detect_cpu_features returns "has_amx has_avx512_vnni has_avx512_bf16"
CPU_FEATURES=$(detect_cpu_features)
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
HAS_AVX512_VNNI=$(echo "$CPU_FEATURES" | cut -d' ' -f2)
HAS_AVX512_BF16=$(echo "$CPU_FEATURES" | cut -d' ' -f3)
export CPUINFER_CPU_INSTRUCT=NATIVE
if [ "$HAS_AMX" = "1" ]; then
echo "✓ AMX instructions detected"
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
echo " Configuration: NATIVE + AMX=ON (best performance)"
echo ""
echo " ⚠️ Note: If you plan to use LLAMAFILE backend, use manual mode:"
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2/FANCY"
echo " export CPUINFER_ENABLE_AMX=OFF"
echo " ./install.sh build --manual"
echo "Configuration: NATIVE + AMX=ON"
echo " ✓ Best performance on this machine"
echo " ✗ Binary requires Sapphire Rapids or newer CPU"
else
echo " AMX instructions not detected"
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=OFF
echo " Configuration: NATIVE + AMX=OFF"
echo ""
echo "Configuration: NATIVE + AMX=OFF"
echo " ✓ Using AVX512/AVX2 instructions"
fi
echo ""
echo " ⚠️ IMPORTANT: This binary is optimized for THIS CPU only"
echo " To build portable binaries for distribution, use:"
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2"
echo " export CPUINFER_ENABLE_AMX=OFF"
echo " ./install.sh build --manual"
# Fine-grained AVX512 subset detection (with fallback support)
echo ""
echo "AVX512 Feature Detection:"
# VNNI: Check if user manually set it, otherwise auto-detect
if [ -n "${CPUINFER_ENABLE_AVX512_VNNI:-}" ]; then
echo " VNNI: User override = $CPUINFER_ENABLE_AVX512_VNNI"
else
if [ "$HAS_AVX512_VNNI" = "1" ]; then
echo " VNNI: ✓ Detected (hardware acceleration enabled)"
export CPUINFER_ENABLE_AVX512_VNNI=ON
else
echo " VNNI: ✗ Not detected (will use software fallback, 2-3x slower)"
export CPUINFER_ENABLE_AVX512_VNNI=OFF
fi
fi
# BF16: Check if user manually set it, otherwise auto-detect
if [ -n "${CPUINFER_ENABLE_AVX512_BF16:-}" ]; then
echo " BF16: User override = $CPUINFER_ENABLE_AVX512_BF16"
else
if [ "$HAS_AVX512_BF16" = "1" ]; then
echo " BF16: ✓ Detected (hardware acceleration enabled)"
export CPUINFER_ENABLE_AVX512_BF16=ON
else
echo " BF16: ✗ Not detected (will use software fallback, 5-10x slower)"
export CPUINFER_ENABLE_AVX512_BF16=OFF
fi
fi
echo ""
echo " Note: Software fallbacks ensure all code works on older CPUs"
echo " Tip: Override with CPUINFER_ENABLE_AVX512_VNNI/BF16=ON/OFF"
echo ""
echo "To use manual configuration instead, run: $0 build --manual"
echo ""
@@ -250,12 +309,32 @@ build_step() {
# Warn about problematic configuration
if [ "$CPUINFER_CPU_INSTRUCT" = "NATIVE" ] && [ "$CPUINFER_ENABLE_AMX" = "OFF" ]; then
HAS_AMX=$(detect_cpu_features)
CPU_FEATURES=$(detect_cpu_features)
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
if [ "$HAS_AMX" = "1" ]; then
echo "⚠️ WARNING: NATIVE + AMX=OFF on AMX-capable CPU may cause compilation issues!"
echo " Recommended: Use AVX512 or AVX2 instead of NATIVE when AMX=OFF"
echo "=========================================="
echo "⚠️ WARNING: Risky Configuration"
echo "=========================================="
echo ""
read -p "Continue anyway? (y/N) " -n 1 -r
echo "Your configuration:"
echo " CPUINFER_CPU_INSTRUCT = NATIVE"
echo " CPUINFER_ENABLE_AMX = OFF"
echo ""
echo "Your CPU HAS AMX support!"
echo ""
echo "Problem:"
echo " • NATIVE uses -march=native which auto-enables ALL CPU features"
echo " • This may IGNORE your AMX=OFF setting"
echo " • The binary may still contain AMX instructions"
echo ""
echo "Recommended fixes:"
echo " 1) For portable build (recommended for distribution):"
echo " export CPUINFER_CPU_INSTRUCT=AVX512"
echo ""
echo " 2) If you want best performance on this CPU:"
echo " export CPUINFER_ENABLE_AMX=ON"
echo ""
read -p "Continue with risky configuration? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit 1
@@ -271,12 +350,15 @@ export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
export CPUINFER_VERBOSE=${CPUINFER_VERBOSE:-1}
echo "=========================================="
echo "Building kt-kernel with configuration:"
echo " CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
echo " CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
echo " CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
echo "=========================================="
echo " CPUINFER_CPU_INSTRUCT = $CPUINFER_CPU_INSTRUCT"
echo " CPUINFER_ENABLE_AMX = $CPUINFER_ENABLE_AMX"
echo " CPUINFER_ENABLE_AVX512_VNNI = ${CPUINFER_ENABLE_AVX512_VNNI:-AUTO}"
echo " CPUINFER_ENABLE_AVX512_BF16 = ${CPUINFER_ENABLE_AVX512_BF16:-AUTO}"
echo " CPUINFER_BUILD_TYPE = $CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL = $CPUINFER_PARALLEL"
echo ""
pip install . -v

View File

@@ -94,10 +94,10 @@ class AMX_K2_MOE_TP {
}
#endif
inline void dump_buffer_b(const std::string &quantization_type, int expert_idx, const std::string &matrix_type,
typename T::BufferB *buffer) {
auto &quant_config = config_.quant_config;
int &group_size = quant_config.group_size;
inline void dump_buffer_b(const std::string& quantization_type, int expert_idx, const std::string& matrix_type,
typename T::BufferB* buffer) {
auto& quant_config = config_.quant_config;
int& group_size = quant_config.group_size;
printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
matrix_type.c_str());
@@ -110,7 +110,7 @@ class AMX_K2_MOE_TP {
cols = config_.hidden_size;
num_groups = cols / group_size;
scale_elem_count = num_groups * rows;
} else { // down
} else { // down
rows = config_.hidden_size;
cols = config_.intermediate_size;
num_groups = cols / group_size;
@@ -133,8 +133,8 @@ class AMX_K2_MOE_TP {
printf("\n");
}
// Dump quantized weights (as hex uint8)
size_t weight_size = (rows * cols) / 2; // INT4 packed
uint8_t *weight_ptr = (uint8_t *)buffer->b;
size_t weight_size = (rows * cols) / 2; // INT4 packed
uint8_t* weight_ptr = (uint8_t*)buffer->b;
printf(" Weights[first 32 bytes]: ");
for (int i = 0; i < std::min(32, (int)weight_size); i++) {
@@ -232,10 +232,12 @@ class AMX_K2_MOE_TP {
// (config_.expert_num * T::BufferA::M_STEP) in pool_count_ is to ensure padding for each experts.
pool_count_ = config_.max_len * config_.num_experts_per_tok + config_.expert_num * T::BufferA::M_STEP;
gate_up_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
gate_up_ba_pool_bytes_ =
(T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
gate_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
up_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
down_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
down_ba_pool_bytes_ =
(T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
down_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.hidden_size)) + pool_count_ * 64;
mem_requests.append_pointer(&gate_up_ba_pool_, gate_up_ba_pool_bytes_);
@@ -276,8 +278,7 @@ class AMX_K2_MOE_TP {
ith, nth);
// up part
up_bb_[expert_idx]->from_raw_mat(
(uint8_t*)config_.up_proj +
((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
(uint8_t*)config_.up_proj + ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
ith, nth);
},
nullptr);
@@ -302,19 +303,15 @@ class AMX_K2_MOE_TP {
[this, physical_to_logical_map](int task_id) {
uint64_t expert_idx = task_id;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
size_t scale_elem_count =
(config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
size_t scale_elem_count = (config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
// convert scales from BF16 to FP32
convert_or_copy(gate_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
convert_or_copy(up_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
convert_or_copy(down_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
},
nullptr);
// dump_buffer_b("native", 0, "down", down_bb_[0].get());
@@ -323,10 +320,10 @@ class AMX_K2_MOE_TP {
// Reconstruct weights for all experts to the output buffers
// This function handles the TP-specific portion of the reconstruction for all experts
void write_weights_to_buffer(int gpu_tp_count, int cpu_tp_count, int num_experts, const GeneralMOEConfig& full_config,
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) const {
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) const {
const int group_size = config_.quant_config.group_size;
auto pool = config_.pool->get_subpool(tp_part_idx);
@@ -379,18 +376,19 @@ class AMX_K2_MOE_TP {
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b;
float* gate_scale_src = gate_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight,
gate_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale),
gate_scale_src, cpu_tp_scale_elem_count);
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight, gate_weight_src,
cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale), gate_scale_src,
cpu_tp_scale_elem_count);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b;
float* up_scale_src = up_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight + gpu_tp_weight_bytes,
up_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
up_scale_src, cpu_tp_scale_elem_count);
up_weight_src, cpu_tp_weight_bytes);
convert_or_copy(
(ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
up_scale_src, cpu_tp_scale_elem_count);
// Down (w2) - need to handle column-wise slicing
// The down matrix is transposed compared to gate/up, so we need to extract by columns
@@ -406,17 +404,16 @@ class AMX_K2_MOE_TP {
size_t gpu_col_slice_offset = local_idx * (config_.intermediate_size >> 1);
std::memcpy(w2_weight_dst + w2_expert_base_weight + gpu_col_offset + gpu_col_slice_offset,
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset,
config_.intermediate_size / 2);
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset, config_.intermediate_size / 2);
// Same for scales
size_t gpu_scale_col_offset = col * ((full_config.intermediate_size / gpu_tp_count) / group_size);
size_t cpu_scale_col_offset = col * (config_.intermediate_size / group_size);
size_t gpu_scale_slice_offset = local_idx * (config_.intermediate_size / group_size);
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
down_bb_[expert_id]->d + cpu_scale_col_offset,
config_.intermediate_size / group_size);
convert_or_copy(
(ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
down_bb_[expert_id]->d + cpu_scale_col_offset, config_.intermediate_size / group_size);
}
},
nullptr);
@@ -460,16 +457,15 @@ class AMX_K2_MOE_TP {
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b + cpu_offset_weight;
float* gate_scale_src = gate_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight,
gate_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale),
gate_scale_src, data_per_gpu_tp_scale);
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight, gate_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale), gate_scale_src,
data_per_gpu_tp_scale);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b + cpu_offset_weight;
float* up_scale_src = up_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes,
up_weight_src, data_per_gpu_tp_weight);
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes, up_weight_src,
data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale + gpu_tp_scale_elem_count),
up_scale_src, data_per_gpu_tp_scale);
@@ -477,16 +473,20 @@ class AMX_K2_MOE_TP {
// The down matrix is transposed compared to gate/up, so we need to extract by columns
for (size_t col = 0; col < config_.hidden_size; col++) {
// Calculate the offset within the column for this GPU TP part
size_t col_offset_weight = (col * config_.intermediate_size / 2) + (local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) + (local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
size_t col_offset_weight = (col * config_.intermediate_size / 2) +
(local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) +
(local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
// Copy weights column by column
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight + (col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight +
(col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
(uint8_t*)down_bb_[expert_id]->b + col_offset_weight,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / 2);
// Copy scales column by column
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale + col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale +
col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
down_bb_[expert_id]->d + col_offset_scale,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size);
}
@@ -587,8 +587,7 @@ class AMX_K2_MOE_TP {
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
if (m_local_num_[i] == 0)
continue;
if (m_local_num_[i] == 0) continue;
size_t max_m = (m_local_num_[i] + M_STEP - 1) / M_STEP * M_STEP;
gate_up_ba_[i]->max_m = max_m;
gate_up_ba_[i]->set_data(gate_up_ba_pool_ptr);
@@ -801,7 +800,8 @@ class AMX_K2_MOE_TP {
down_time, weight_time, forward_total_time, max_local_num, qlen);
#endif
// for (int i = 0; i < qlen; i ++)
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size, (float*)output + i * config_.hidden_size);
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size,
// (float*)output + i * config_.hidden_size);
}
void forward_decode(int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
@@ -826,7 +826,7 @@ class AMX_K2_MOE_TP {
m_expert_id_map_[activated_expert] = expert_ids[i];
activated_expert++;
}
size_t offset = 0;
for (int i = 0; i < activated_expert; i++) {
auto expert_idx = m_expert_id_map_[i];
@@ -912,9 +912,9 @@ class AMX_K2_MOE_TP {
amx::vec_mul_kgroup(qlen, config_.intermediate_size, config_.hidden_size, group_size, gate_up_ba_[0],
gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth);
gate_bc_[expert_idx]->to_mat(qlen, m_local_gate_output_ptr_[expert_idx], ith, nth);
}
},
nullptr);
}
},
nullptr);
#ifdef DEBUG_K2_MOE
if (activated_expert > 0) {
@@ -971,7 +971,6 @@ class AMX_K2_MOE_TP {
}
}
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
@@ -1056,9 +1055,9 @@ class AMX_K2_MOE_TP {
}
__m512 weight = _mm512_set1_ps(weights[j]);
__m512 down_output0, down_output1;
avx512_32xbf16_to_32xfp32((__m512i*)(m_local_down_output_ptr_[expert_ids[j]] +
m_local_pos_[0][j] * config_.hidden_size + e),
&down_output0, &down_output1);
avx512_32xbf16_to_32xfp32(
(__m512i*)(m_local_down_output_ptr_[expert_ids[j]] + m_local_pos_[0][j] * config_.hidden_size + e),
&down_output0, &down_output1);
x0 = _mm512_fmadd_ps(down_output0, weight, x0);
x1 = _mm512_fmadd_ps(down_output1, weight, x1);
}
@@ -1151,28 +1150,26 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
// TP-slicing for gate and up (row-major slicing)
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
src_gate + ((i * weight_elem_count) >> 1),
(weight_elem_count >> 1));
src_gate + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
src_up + ((i * weight_elem_count) >> 1),
(weight_elem_count >> 1));
src_up + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
src_gate_scale + (i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
src_gate_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
src_up_scale + (i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
src_up_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
// TP-slicing for down (by column)
for (size_t col = 0; col < config.hidden_size; col++) {
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
src_down + ((col * config.intermediate_size + i * tpc.intermediate_size) >> 1),
(tpc.intermediate_size >> 1));
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
src_down_scale + (col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
memcpy((ggml_bf16_t*)tpc.down_scale +
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
src_down_scale +
(col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
}
},
@@ -1197,43 +1194,45 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
if (tps[i]->config_.load == false) {
pool->get_subpool(i)->do_work_stealing_job(
tpc.expert_num, nullptr,
[&](int expert_id_) { // weight and scale are all in col majored.
[&](int expert_id_) { // weight and scale are all in col majored.
size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
// weight and scale TP-slicing for gate and up
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.gate_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
(uint8_t*)config.gate_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.up_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
(uint8_t*)config.up_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.gate_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
(ggml_bf16_t*)config.gate_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
sizeof(ggml_bf16_t) * scales_elem_count);
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.up_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
(ggml_bf16_t*)config.up_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
sizeof(ggml_bf16_t) * scales_elem_count);
// weight and scale TP-slicing for down (by column)
for (size_t col = 0; col < config.hidden_size; col++) {
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
col * config.intermediate_size + i * tpc.intermediate_size) >>
1),
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
(ggml_bf16_t*)config.down_scale + ((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
1),
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
memcpy((ggml_bf16_t*)tpc.down_scale +
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
(ggml_bf16_t*)config.down_scale +
((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
}
},
nullptr);
@@ -1245,7 +1244,8 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
#ifdef LOAD_TIME_PROFILE
{
auto load_now_time = std::chrono::high_resolution_clock::now();
alloc_and_tp_slice_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
alloc_and_tp_slice_time =
std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
load_last = load_now_time;
}
#endif
@@ -1277,9 +1277,11 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
cleanup_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
}
auto load_end_time = std::chrono::high_resolution_clock::now();
auto load_total_time = std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
auto load_total_time =
std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
printf(
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, total: %ld us\n",
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, "
"total: %ld us\n",
tp_count, alloc_and_tp_slice_time, tps_load_time, cleanup_time, load_total_time);
#endif
@@ -1307,15 +1309,13 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
auto& config = this->config;
auto pool = config.pool;
// Each TP part writes to its corresponding buffer
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num,
w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num, w13_weight_ptrs, w13_scale_ptrs,
w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
// Note: w13 combines gate and up projections
// Split w13 pointers for gate and up
this->tps[numa_id]->write_weights_to_buffer(
gpu_tp_count, this->tp_count,
gpu_experts_num, this->config,
w13_weight_ptrs, w13_scale_ptrs, //gate + up use w13
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
this->tps[numa_id]->write_weights_to_buffer(gpu_tp_count, this->tp_count, gpu_experts_num, this->config,
w13_weight_ptrs, w13_scale_ptrs, // gate + up use w13
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
});
}

View File

@@ -350,10 +350,7 @@ struct BufferAKGroupImpl {
return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size);
}
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
: max_m(max_m),
k(k),
k_group_size(k_group_size) {
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : max_m(max_m), k(k), k_group_size(k_group_size) {
ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
ASSERT_RELEASE(max_m % M_STEP == 0, "max_m must be multiple of M_STEP");
ASSERT_RELEASE(k % K_STEP == 0, "k must be multiple of K_STEP");
@@ -459,8 +456,7 @@ struct BufferASmallKGroupImpl : public BufferAKGroupImpl<K> {
static constexpr int K_STEP = K::K_STEP;
static constexpr int K_BLOCK = K::K_BLOCK;
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
: Base(max_m, k, k_group_size, ptr) {}
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : Base(max_m, k, k_group_size, ptr) {}
// Override from_mat to write only 32 bytes per K_STEP iteration
void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
@@ -991,8 +987,8 @@ struct BufferBInt4WithZeroImpl {
template <typename K>
struct BufferBInt4KGroupImpl {
using dt = typename K::dt;
dt* b; // packed signed int4 weights, col majored
float* d; // scales only (no mins/zero-points), row majored
dt* b; // packed signed int4 weights, col majored
float* d; // scales only (no mins/zero-points), row majored
int n, k, k_group_size, k_group_count;
static constexpr int N_STEP = K::N_STEP;
@@ -1009,8 +1005,8 @@ struct BufferBInt4KGroupImpl {
assert(n % N_STEP == 0);
assert(k % K_STEP == 0);
if (n % N_STEP || k % K_STEP || k % k_group_size) {
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP,
K_STEP, k_group_size);
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, K_STEP,
k_group_size);
throw std::runtime_error("n or k is not aligned to N_STEP or K_STEP");
}
k_group_count = k / k_group_size;
@@ -1043,8 +1039,8 @@ struct BufferBInt4KGroupImpl {
// Get scale pointer for a specific row and k_group
float* get_scale(int n, int n_begin, int k, int k_begin) {
int k_group_idx = k_begin / k_group_size;
return d + n_begin * (k / k_group_size) + k_group_idx;
int k_group_idx = k_begin / k_group_size;
return d + n_begin * (k / k_group_size) + k_group_idx;
}
// Split range for parallel processing

View File

@@ -1552,9 +1552,9 @@ struct GemmKernel224Int4_1 {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2491,7 +2491,7 @@ struct GemmKernel224Int4_1KGroup {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
}
}
}
@@ -2503,7 +2503,7 @@ struct GemmKernel224Int4_1KGroup {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2767,7 +2767,7 @@ struct GemmKernel224Int4_1_LowKGroup {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
}
}
}
@@ -2779,7 +2779,7 @@ struct GemmKernel224Int4_1_LowKGroup {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_hi = _mm512_srli_epi32(_mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2850,7 +2850,7 @@ struct GemmKernel224Int4SmallKGroup {
using output_t = int32_t;
static constexpr double ELEMENT_SIZE = 0.5;
static const int VNNI_BLK = 4;
static const int M_STEP = 1;
static const int N_STEP = 32;
static const int K_STEP = 32;
@@ -2870,18 +2870,15 @@ struct GemmKernel224Int4SmallKGroup {
alignas(64) static constexpr uint8_t hi_mask_arr[32] = {
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0
};
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};
alignas(64) static constexpr uint8_t lo_mask_arr[32] = {
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
};
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};
alignas(64) static constexpr uint8_t sign_xor_arr[32] = {
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88
};
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88};
static __m256i hi_mask() { return *((__m256i*)(&hi_mask_arr[0])); }
static __m256i lo_mask() { return *((__m256i*)(&lo_mask_arr[0])); }
static __m256i sign_xor_mask() { return *((__m256i*)(&sign_xor_arr[0])); }
@@ -2902,27 +2899,28 @@ struct GemmKernel224Int4SmallKGroup {
const __m512i lane_shuffle = _mm512_set_epi64(7, 6, 3, 2, 5, 4, 1, 0);
return _mm512_permutexvar_epi64(lane_shuffle, result);
}
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB *bb, BufferC* bc, int ith, int nth) {
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB* bb,
BufferC* bc, int ith, int nth) {
auto [n_start, n_end] = split_range_n(n, ith, nth);
for (int m_begin = 0; m_begin < m; m_begin ++) {
for (int m_begin = 0; m_begin < m; m_begin++) {
float* c = bc->get_submat(m, n, m_begin, n_start);
__m512i* a512 = (__m512i*)ba->get_submat(m, k, m_begin, 0);
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin ++) {
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin++) {
__m256i* b256 = (__m256i*)bb->get_submat(n, k, n_block_begin, 0);
float* as = (float*)ba->get_scale(m, m_begin, k, 0);
float* bs = (float*)bb->get_scale(n, n_block_begin, k, 0);
__m512 sum = _mm512_setzero_ps();
#define WORK_K_BLOCK(k_block) \
{ \
__m256 abscale0 = _mm256_set1_ps(as[(k_block)*2] * bs[(k_block)*2]); \
__m256 abscale1 = _mm256_set1_ps(as[(k_block)*2+1] * bs[(k_block)*2+1]); \
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
__m512i mul = _mm512_setzero_si512(); \
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
}
#define WORK_K_BLOCK(k_block) \
{ \
__m256 abscale0 = _mm256_set1_ps(as[(k_block) * 2] * bs[(k_block) * 2]); \
__m256 abscale1 = _mm256_set1_ps(as[(k_block) * 2 + 1] * bs[(k_block) * 2 + 1]); \
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
__m512i mul = _mm512_setzero_si512(); \
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
}
for (int k_block = 0; k_block < k / 64; k_block += 2) {
WORK_K_BLOCK(k_block);
@@ -2935,13 +2933,15 @@ struct GemmKernel224Int4SmallKGroup {
}
};
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);

View File

@@ -211,26 +211,44 @@ inline __m256i merge_q8K_bsum(block_q8_K* b) {
return _mm256_madd_epi16(_mm256_loadu_si256((__m256i*)b->bsums), _mm256_set1_epi16(1));
}
// Drop-in replacement for _mm512_dpbusd_epi32 (AVX512-VNNI VPDPBUSD):
// for each 32-bit lane, accumulate into `src` the sum of the four byte
// products (unsigned byte of `a`) * (signed byte of `b`).
inline __m512i _mm512_dpbusd_epi32_compat(__m512i src, __m512i a, __m512i b) {
#if defined(__AVX512VNNI__)
  // Fast path: a single native VNNI instruction.
  return _mm512_dpbusd_epi32(src, a, b);
#else
  // Fallback: emulate with AVX512BW 16-bit arithmetic.
  const __m512i mask_lo = _mm512_set1_epi16(0x00FF);
  const __m512i ones16 = _mm512_set1_epi16(1);
  // Even-indexed bytes: zero-extend a (unsigned); sign-extend b via shl/sar.
  __m512i a_even = _mm512_and_si512(a, mask_lo);
  __m512i b_even = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);
  // Odd-indexed bytes: logical shift keeps a unsigned, arithmetic shift keeps b signed.
  __m512i a_odd = _mm512_srli_epi16(a, 8);
  __m512i b_odd = _mm512_srai_epi16(b, 8);
  // u8*s8 products lie in [-32640, 32385], so the low-16 multiply is exact.
  __m512i prod_even = _mm512_mullo_epi16(a_even, b_even);
  __m512i prod_odd = _mm512_mullo_epi16(a_odd, b_odd);
  // madd against 1s widens and sums adjacent 16-bit products into 32-bit lanes.
  __m512i sum_even = _mm512_madd_epi16(prod_even, ones16);
  __m512i sum_odd = _mm512_madd_epi16(prod_odd, ones16);
  // The four-product lane sum fits in int32, matching non-saturating VPDPBUSD.
  return _mm512_add_epi32(src, _mm512_add_epi32(sum_even, sum_odd));
#endif
}
// Signed x signed int8 dot-product accumulate (emulates VPDPBSSD):
// for each 32-bit lane, add to `src` the sum of four (signed a byte) *
// (signed b byte) products.
//
// Strategy: move a's sign onto b, take |a| (now a valid unsigned operand),
// then reuse the unsigned x signed dot product.
// NOTE(review): a byte of -128 cannot be negated in int8, so the |a| trick is
// inexact for that single value -- confirm quantized inputs avoid -128.
inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
  // Split into 256-bit halves so _mm256_sign_epi8 can be used.
  __m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
  __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
  __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);
  // Transfer the sign of each a byte onto the corresponding b byte.
  b_lo = _mm256_sign_epi8(b_lo, a_lo);
  b_hi = _mm256_sign_epi8(b_hi, a_hi);
  // Reassemble the sign-adjusted b.
  b = _mm512_inserti64x4(b, b_lo, 0);
  b = _mm512_inserti64x4(b, b_hi, 1);
  // |a| is now interpretable as an unsigned byte operand.
  a = _mm512_abs_epi8(a);
  // Unsigned x signed dot product; compat wrapper falls back when VNNI is absent.
  return _mm512_dpbusd_epi32_compat(src, a, b);
}
} // namespace amx

View File

@@ -9,10 +9,42 @@ static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
_mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}
// FP32 to BF16 conversion (32 floats -> 32 bf16).
// src0 fills the low 256 bits of *dst, src1 the high 256 bits, matching
// _mm512_cvtne2ps_pbh(*src1, *src0). Fast path needs AVX512BF16; the
// fallback uses only AVX512F bit manipulation.
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
#if defined(HAVE_AVX512BF16) || defined(__AVX512BF16__)
  // Fast path: use native AVX512BF16 instruction
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
#else
  // Fallback: manual BF16 conversion using bit manipulation.
  // BF16 is the upper 16 bits of FP32 (with rounding).
  __m512i i0 = _mm512_castps_si512(*src0);
  __m512i i1 = _mm512_castps_si512(*src1);
  // Round to nearest even: add 0x7FFF + ((val >> 16) & 1).
  // NOTE(review): NaNs are not special-cased; a NaN whose payload lives only
  // in the low 16 mantissa bits rounds to an infinity encoding here --
  // confirm callers never feed NaN through this path.
  __m512i round0 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i0, 16), _mm512_set1_epi32(1)));
  __m512i round1 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i1, 16), _mm512_set1_epi32(1)));
  i0 = _mm512_add_epi32(i0, round0);
  i1 = _mm512_add_epi32(i1, round1);
  // Extract upper 16 bits (BF16).
  i0 = _mm512_srli_epi32(i0, 16);
  i1 = _mm512_srli_epi32(i1, 16);
  // Pack 32-bit values to 16-bit (values are <= 0xFFFF, so the unsigned
  // saturation in packus is a no-op).
  __m512i result = _mm512_packus_epi32(i0, i1);
  // packus interleaves per 128-bit lane; this qword permutation restores
  // [src0[0..15], src1[0..15]] order.
  result = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), result);
  _mm512_storeu_si512(dst, result);
#endif
}
// BF16 to FP32 conversion (32 bf16 -> 32 floats)
// This does NOT require AVX512BF16 - uses basic AVX512 bit manipulation
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
_mm512_storeu_ps(dst0, _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));

View File

@@ -194,7 +194,7 @@ class CMakeBuild(build_ext):
info["raw"]["flags"] = flags
# feature summary
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl", "avx512vnni"]):
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl"]):
info["features"].add("AVX512")
if "avx2" in flags or "avx2" in low:
info["features"].add("AVX2")
@@ -205,6 +205,16 @@ class CMakeBuild(build_ext):
):
info["features"].add("AMX")
# Fine-grained AVX512 subset detection
if any(f in flags for f in ["avx512_vnni", "avx512vnni"]):
info["features"].add("AVX512_VNNI")
if any(f in flags for f in ["avx512_bf16", "avx512bf16"]):
info["features"].add("AVX512_BF16")
if any(f in flags for f in ["avx512_vbmi", "avx512vbmi"]):
info["features"].add("AVX512_VBMI")
if any(f in flags for f in ["avx512_vpopcntdq", "avx512vpopcntdq"]):
info["features"].add("AVX512_VPOPCNTDQ")
elif sysname == "Darwin":
# macOS: Apple Silicon (arm64) vs Intel
arch = platform.machine().lower()
@@ -229,133 +239,6 @@ class CMakeBuild(build_ext):
return info
def build_extension(self, ext: CMakeExtension):
    """
    Entry point invoked by setuptools for each extension.

    Routes to the multi-variant build (AMX + AVX512 + AVX2 bundled in one
    wheel) when the CPUINFER_BUILD_ALL_VARIANTS env flag is set; otherwise
    performs the original single-variant build.
    """
    build_all = _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False)
    builder = self.build_multi_variants if build_all else self._build_single_variant
    builder(ext)
def build_multi_variants(self, ext: CMakeExtension):
    """
    Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel.

    This method is called when CPUINFER_BUILD_ALL_VARIANTS=1 is set.
    It builds three separate extensions with different CPU instruction sets
    (selected via CPUINFER_* environment variables) and renames each output
    .so file with a variant suffix so the loader can pick one at runtime.
    """
    print("=" * 80)
    print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)")
    print("=" * 80)

    # Define the 3 variants to build; each entry fully specifies both env
    # keys, so later variants overwrite earlier ones without leakage.
    variants = [
        {
            'name': 'amx',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'NATIVE',
                'CPUINFER_ENABLE_AMX': 'ON',
            },
            'description': 'AMX variant (Intel Sapphire Rapids+)'
        },
        {
            'name': 'avx512',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'AVX512',
                'CPUINFER_ENABLE_AMX': 'OFF',
            },
            'description': 'AVX512 variant (Intel Skylake-X/Ice Lake/Cascade Lake)'
        },
        {
            'name': 'avx2',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'AVX2',
                'CPUINFER_ENABLE_AMX': 'OFF',
            },
            'description': 'AVX2 variant (maximum compatibility)'
        }
    ]

    # Save original environment so it can be restored after all builds.
    original_env = os.environ.copy()

    extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()

    for i, variant in enumerate(variants, 1):
        print(f"\n{'=' * 80}")
        print(f"Building variant {i}/3: {variant['description']}")
        print(f"{'=' * 80}\n")

        # Set variant-specific environment variables
        os.environ.update(variant['env'])

        # Use a unique build directory for this variant
        original_build_temp = self.build_temp
        self.build_temp = str(Path(self.build_temp) / f"variant_{variant['name']}")

        try:
            # Build this variant (calls the single-variant build logic)
            self._build_single_variant(ext)

            # Rename the generated .so file to include variant suffix
            # Original: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
            # Renamed: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so

            # Extract the base extension name (without package prefix)
            # ext.name is "kt_kernel.kt_kernel_ext", we want "kt_kernel_ext"
            base_ext_name = ext.name.split('.')[-1]

            # Find the newly built .so file; only un-suffixed files count,
            # so already-renamed variants from prior iterations are skipped.
            import time
            time.sleep(0.5)  # Give filesystem time to sync
            built_candidates = [
                f for f in Path(extdir).glob("*.so")
                if f.name.startswith(base_ext_name) and not f.name.startswith(f"_{base_ext_name}_")
            ]

            if not built_candidates:
                print(f"WARNING: No .so file found for {base_ext_name} in {extdir}")
                print(f"Files in {extdir}:")
                for f in Path(extdir).glob("*.so"):
                    print(f" {f.name}")

            for so_file in built_candidates:
                # Extract the python tag part (e.g., ".cpython-311-x86_64-linux-gnu.so")
                suffix = so_file.name.replace(base_ext_name, "")
                new_name = f"_{base_ext_name}_{variant['name']}{suffix}"
                new_path = extdir / new_name
                print(f"-- Renaming {so_file.name} -> {new_name}")
                if new_path.exists():
                    print(f" WARNING: Target file already exists, removing: {new_path}")
                    new_path.unlink()
                so_file.rename(new_path)
                print(f" ✓ Successfully renamed to {new_name}")
        finally:
            # Restore build_temp for next iteration
            self.build_temp = original_build_temp

    # Restore original environment
    os.environ.clear()
    os.environ.update(original_env)

    print(f"\n{'=' * 80}")
    print("✓ Successfully built all 3 CPU variants")
    print(f"{'=' * 80}\n")
def _build_single_variant(self, ext: CMakeExtension):
"""
Build a single CPU variant. This contains the core build logic
extracted from the original build_extension method.
"""
# Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
def detect_cuda_toolkit() -> bool:
# Respect CUDA_HOME
@@ -403,10 +286,6 @@ class CMakeBuild(build_ext):
auto_cuda = detect_cuda_toolkit()
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
elif cuda_env:
print("-- CPUINFER_USE_CUDA explicitly enabled")
else:
print("-- CPUINFER_USE_CUDA explicitly disabled")
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
cfg = default_build_type()
@@ -461,6 +340,17 @@ class CMakeBuild(build_ext):
else:
print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella")
# Fine-grained AVX512 subset flags: only enable if CPU actually supports them
# These are passed to CMake to conditionally add compiler flags
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_VNNI", "LLAMA_AVX512_VNNI"):
if "AVX512_VNNI" in d["features"]:
cmake_args.append("-DLLAMA_AVX512_VNNI=ON")
print("-- AVX512_VNNI detected; enabling (-DLLAMA_AVX512_VNNI=ON)")
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_BF16", "LLAMA_AVX512_BF16"):
if "AVX512_BF16" in d["features"]:
cmake_args.append("-DLLAMA_AVX512_BF16=ON")
print("-- AVX512_BF16 detected; enabling (-DLLAMA_AVX512_BF16=ON)")
# Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend
# (Do not enable purely on vendor auto-detection to avoid surprise behavior.)
amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None)
@@ -562,6 +452,7 @@ class CMakeBuild(build_ext):
# Version (simple). If you later add a python package dir, you can read from it.
################################################################################
# Import version from shared version.py at project root
_version_file = Path(__file__).resolve().parent.parent / "version.py"
if _version_file.exists():