Fix CPU Instruction Set and Installation (#1729)

* [fix](kt-kernel): fix AVX512 cpu instruction set detection

* [feat](kt-kernel): AVX512 fallback kernel for RAW-INT4

* [fix](kt-kernel): fix setup version issue

* [fix](kt-kernel): update install for custom build

* [docs](kt-kernel): new installation guide for various cpu instruction set

* [fix](kt-kernel): fix _mm512_dpbusd_epi32_compat fallback implementation

* [style](kt-kernel): clang format
This commit is contained in:
Jiaqi Liao
2025-12-18 00:11:57 +08:00
committed by GitHub
parent a8667ddb58
commit 3c134359bc
11 changed files with 545 additions and 478 deletions

View File

@@ -28,7 +28,7 @@ option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF
# LTO control
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)
project(kt_kernel_ext VERSION 0.4.2)
project(kt_kernel_ext VERSION 0.4.4)
# Choose compilers BEFORE project() so CMake honors them
if(USE_CONDA_TOOLCHAIN)
if(NOT DEFINED ENV{CONDA_PREFIX} OR NOT EXISTS "$ENV{CONDA_PREFIX}")
@@ -378,7 +378,20 @@ if(HOST_IS_X86)
target_link_libraries(${test_name} llama OpenMP::OpenMP_CXX numa)
endforeach()
endif()
list(APPEND ARCH_FLAGS -mfma -mf16c -mavx512bf16 -mavx512vnni)
# Note: AVX512 subset flags (-mavx512vnni, -mavx512bf16) are already added
# in the generic x86 detection block above (lines 276-289) when corresponding
# LLAMA_AVX512_* options are enabled. No need to add them again here.
# -mfma is already added by LLAMA_NATIVE (line 254), LLAMA_AVX*, or LLAMA_FMA blocks.
# Only add -mf16c if LLAMA_F16C is not already enabled.
if(NOT LLAMA_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if(LLAMA_AVX512_VNNI)
message(STATUS "AVX512_VNNI enabled")
endif()
if(LLAMA_AVX512_BF16)
message(STATUS "AVX512_BF16 enabled")
endif()
endif()
endif()

View File

@@ -37,6 +37,7 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
-**Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
-**Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
-**AMD CPUs with BLIS**: Supported (for int8 prefill & decode)
-**Kimi-K2 Native INT4 (RAWINT4)**: Supported on AVX512 CPUs (CPU-GPU shared INT4 weights) - [Guide](../doc/en/Kimi-K2-Thinking-Native.md)
## Features
@@ -49,6 +50,8 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
### Option 1: Install from PyPI (Recommended for Most Users)
Coming soon...
Choose the version matching your CUDA installation:
```bash
@@ -104,76 +107,55 @@ python -c "import kt_kernel"
---
### Option 2: Install from Source (For AMD, ARM, or Custom Builds)
### Option 2: Install from Source (For Local Use or Custom Builds)
If you need AMD (BLIS), ARM (KML), or custom CUDA versions, build from source:
Build from source for local installation or when you need AMD (BLIS), ARM (KML), or custom CUDA versions.
#### Prerequisites
First, initialize git submodules:
First, initialize git submodules and create a conda environment:
```bash
git submodule update --init --recursive
```
#### Quick Installation
Step 0: Create and activate a conda environment (recommended):
```bash
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```
You can now install in two clear steps using the same script.
#### Quick Installation (Recommended)
**Option A: Two-step** (specify dependencies installation and build separately)
```bash
# 1) Install system prerequisites (cmake, hwloc, pkg-config)
./install.sh deps
# 2) Build and install kt-kernel (auto-detects CPU instruction set)
# By default, the script cleans the local ./build directory before compiling
./install.sh build
```
**Option B: One-step**
Simply run the install script - it will auto-detect your CPU and optimize for best performance:
```bash
./install.sh
```
The install script will:
- Auto-detect CPU capabilities (AMX support)
- Install `cmake` via conda (if available)
- Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
**What gets configured automatically:**
- AMX CPU detected → `NATIVE + AMX=ON`
- No AMX detected → `NATIVE + AMX=OFF`
⚠️ **Important for LLAMAFILE backend users:**
If you have an AMX-capable CPU but plan to use the LLAMAFILE backend, do NOT use the default auto-detection build.
Use "manual mode" with `CPUINFER_CPU_INSTRUCT` set to `AVX512` or `AVX2` instead of `NATIVE` to avoid compilation issues (see below).
⚠️ **Important for BLIS AMD backend users:**
for the installation guide, see this [issue](https://github.com/kvcache-ai/ktransformers/issues/1601)
### Manual Configuration (Advanced)
If you need specific build options (e.g., for LLAMAFILE backend, compatibility, or binary distribution):
**What happens automatically:**
- Auto-detects CPU capabilities (AMX, AVX512_VNNI, AVX512_BF16)
- Installs system dependencies (`cmake`, `libhwloc-dev`, `pkg-config`)
- Builds optimized binary for **your CPU only** (using `-march=native`)
- **Software fallbacks**: Automatically enabled for CPUs without VNNI/BF16
**Optional: Two-step installation**
```bash
# Example for LLAMAFILE backend on AMX CPU with AVX512
export CPUINFER_CPU_INSTRUCT=AVX512 # Options: NATIVE, AVX512, AVX2, FANCY
export CPUINFER_ENABLE_AMX=OFF # Options: ON, OFF
# Build only (skip auto-detection of instruction set)
./install.sh build --manual
./install.sh deps # Install dependencies only
./install.sh build # Build and install kt-kernel
```
For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
**CPU Requirements by Backend:**
| Backend | Minimum CPU Requirement | Example CPUs | Notes |
|---------|-------------------------|--------------|-------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+), AMD Zen+ | Universal compatibility |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+), Ice Lake, Cascade Lake | Software fallbacks for VNNI/BF16 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | Best performance, requires AMX hardware |
**Software Fallback Support (AVX512 backends):**
- ✅ VNNI fallback: Uses AVX512BW instructions
- ✅ BF16 fallback: Uses AVX512F instructions
- ✅ Older AVX512 CPUs (Skylake-X, Cascade Lake) can run RAWINT4 with fallbacks
⚠️ **Portability Note:** The default build is optimized for your specific CPU and may not work on different/older CPUs. For portable builds or binary distribution, see [Manual Configuration](#manual-configuration-advanced) below.
⚠️ **AMD BLIS backend users:** See [installation guide](https://github.com/kvcache-ai/ktransformers/issues/1601) for AMD-specific setup.
## Verification
@@ -482,11 +464,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
KTMoEWrapper.clear_buffer_cache()
```
### Manual Configuration (Advanced)
For portable builds, binary distribution, or cross-machine deployment, you need to manually specify target instruction sets:
```bash
# General distribution (works on any AVX512 CPU from 2017+)
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# Maximum compatibility (works on any CPU from 2013+)
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# Modern CPUs only (Ice Lake+, Zen 4+)
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```
**Optional: Override VNNI/BF16 detection**
```bash
# Force enable/disable VNNI and BF16 (for testing fallbacks)
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```
See `./install.sh --help` for all available options.
---
## Build Configuration
### Manual Installation
### Manual Installation (Without install.sh)
If you prefer manual installation without the `install.sh` script, follow these steps:
If you prefer manual installation without the `install.sh` script:
#### 1. Install System Dependencies
@@ -508,27 +523,29 @@ If you prefer manual installation without the `install.sh` script, follow these
**Instruction Set Details:**
- **`NATIVE`**: Auto-detect and use all available CPU instructions (`-march=native`) - **Recommended for best performance**
- **`AVX512`**: Explicit AVX512 support for Skylake-SP and Cascade Lake
- **`AVX2`**: AVX2 support for maximum compatibility
- **`FANCY`**: AVX512 with full extensions (AVX512F/BW/DQ/VL/VNNI) for Ice Lake+ and Zen 4+. Use this when building pre-compiled binaries to distribute to users with modern CPUs. For local builds, prefer `NATIVE` for better performance.
| Option | Target CPUs | Use Case |
|--------|-------------|----------|
| **`NATIVE`** | Your specific CPU only | Local builds (best performance, **default**) |
| **`AVX512`** | Skylake-X, Ice Lake, Cascade Lake, Zen 4+ | General distribution |
| **`AVX2`** | Haswell (2013) and newer | Maximum compatibility |
| **`FANCY`** | Ice Lake+, Zen 4+ | Modern CPUs with full AVX512 extensions |
**Example Configurations:**
```bash
# Maximum performance on AMX CPU
# Local use - maximum performance (default behavior)
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
export CPUINFER_ENABLE_AMX=ON # or OFF
# AVX512 CPU without AMX
# Distribution build - works on any AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
# Compatibility build
# Maximum compatibility - works on CPUs since 2013
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
# Debug build for development
# Debug build
export CPUINFER_BUILD_TYPE=Debug
export CPUINFER_VERBOSE=1
```

View File

@@ -38,6 +38,7 @@
-**带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重)
-**通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重)
-**带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode)
-**Kimi-K2 原生 INT4(RAWINT4)**:支持 AVX512 CPU(CPU-GPU 共享 INT4 权重)- [使用指南](../doc/en/Kimi-K2-Thinking-Native.md)
## 特性
@@ -49,69 +50,55 @@
## 安装
### 先决条件
### 从源码安装(本机使用或自定义构建)
首先初始化子模块:
适用于本地安装,或需要 AMD (BLIS)、ARM (KML) 或自定义 CUDA 版本的场景。
#### 先决条件
首先初始化子模块并创建 conda 环境:
```bash
git submodule update --init --recursive
```
### 快速安装(推荐)
第 0 步:创建并激活一个 conda 环境(推荐):
```bash
conda create -n kt-kernel python=3.11 -y
conda activate kt-kernel
```
随后可以用同一个脚本分两步或一步安装。
#### 快速安装(推荐)
方案 A(两步):可以指定依赖安装与编译构建
```bash
# 1) 安装系统依赖(cmake, hwloc, pkg-config)
./install.sh deps
# 2) 构建并安装 kt-kernel(自动检测 CPU 指令集)
# 默认会在编译前清理本地 ./build 目录
./install.sh build
```
方案 B(一步)
只需运行安装脚本,它会自动检测 CPU 并优化性能:
```bash
./install.sh
```
安装脚本会:
- 自动检测 CPU 能力(是否支持 AMX)
- 尝试通过 conda 安装 `cmake`(若可用)
- 根据你的操作系统安装系统依赖(`libhwloc-dev`、`pkg-config`)
**自动配置内容:**
- 检测到 AMX CPU → 使用 `NATIVE + AMX=ON`
- 未检测到 AMX → 使用 `NATIVE + AMX=OFF`
⚠️ **LLAMAFILE 后端用户特别说明:**
如果你有带 AMX 的 CPU但是计划使用 LLAMAFILE 后端,请不要使用默认的自动检测构建方式。
请使用“手动模式”,并将 `CPUINFER_CPU_INSTRUCT` 设为 `AVX512``AVX2` 而非 `NATIVE`,以避免编译期异常(见下文)。
### 手动配置(进阶)
如果你需要更精细的构建选项(例如为 LLAMAFILE 后端、兼容性或二进制分发配置):
**自动完成的操作:**
- 自动检测 CPU 能力(AMX、AVX512_VNNI、AVX512_BF16)
- 安装系统依赖(`cmake`、`libhwloc-dev`、`pkg-config`)
- 为**你的 CPU** 构建优化二进制(使用 `-march=native`)
- **软件回退机制**:为不支持 VNNI/BF16 的 CPU 自动启用
**可选:分步安装**
```bash
# 在带 AMX 的 CPU 上构建 LLAMAFILE 后端的示例(使用 AVX512)
export CPUINFER_CPU_INSTRUCT=AVX512 # 选项: NATIVE, AVX512, AVX2, FANCY
export CPUINFER_ENABLE_AMX=OFF # 选项: ON, OFF
# 仅构建(不进行指令集的自动检测)
./install.sh build --manual
./install.sh deps # 仅安装依赖
./install.sh build # 构建并安装 kt-kernel
```
更多构建选项和二进制分发配置,请参见 [构建配置](#构建配置) 一节。
如果遇到问题,可参考 [错误排查](#错误排查)。
**不同后端的 CPU 要求:**
| 后端 | 最低 CPU 要求 | 示例 CPU | 说明 |
|------|---------------|----------|------|
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+)、AMD Zen+ | 通用兼容性 |
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+)、Ice Lake、Cascade Lake | 支持 VNNI/BF16 软件回退 |
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | 最佳性能,需要 AMX 硬件 |
**软件回退支持AVX512 后端):**
- ✅ VNNI 回退:使用 AVX512BW 指令
- ✅ BF16 回退:使用 AVX512F 指令
- ✅ 老的 AVX512 CPU(Skylake-X、Cascade Lake)可以运行 RAWINT4(使用回退)
⚠️ **可移植性说明:** 默认构建针对你的特定 CPU 优化,可能无法在不同/更老的 CPU 上运行。如需打包分发或跨机器部署,请参见下方的 [手动配置](#手动配置进阶)。
⚠️ **AMD BLIS 后端用户:** 请参见 [安装指南](https://github.com/kvcache-ai/ktransformers/issues/1601) 了解 AMD 专用配置。
## 验证安装
@@ -421,11 +408,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
KTMoEWrapper.clear_buffer_cache()
```
### 手动配置(进阶)
如需打包分发、跨机器部署或构建可移植二进制,需要手动指定目标指令集:
```bash
# 通用分发版(适用于 2017+ 的任何 AVX512 CPU)
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# 最大兼容性(适用于 2013+ 的任何 CPU)
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
# 仅限现代 CPU(Ice Lake+、Zen 4+)
export CPUINFER_CPU_INSTRUCT=FANCY
export CPUINFER_ENABLE_AMX=OFF
./install.sh build --manual
```
**可选:覆盖 VNNI/BF16 检测**
```bash
# 强制启用/禁用 VNNI 和 BF16(用于测试回退)
export CPUINFER_ENABLE_AVX512_VNNI=OFF
export CPUINFER_ENABLE_AVX512_BF16=OFF
./install.sh
```
运行 `./install.sh --help` 查看所有可用选项。
---
## 构建配置
### 手动安装
### 手动安装(不使用 install.sh)
如果你不想使用 `install.sh`,可以按以下步骤手动构建
如果你不想使用 `install.sh` 脚本
#### 1. 安装系统依赖
@@ -447,24 +467,25 @@ KTMoEWrapper.clear_buffer_cache()
**指令集说明:**
- **`NATIVE`**:自动检测并启用所有可用 CPU 指令(`-march=native`)——**本机运行时首选**
- **`AVX512`**:为 Skylake-SP / Cascade Lake 显式开启 AVX512
- **`AVX2`**:开启 AVX2,兼容性较好
- **`FANCY`**:开启完整 AVX512 扩展(AVX512F/BW/DQ/VL/VNNI),适用于 Ice Lake+ 和 Zen 4+ 等较新平台。
用于向用户分发预编译二进制时推荐;本地构建推荐使用 `NATIVE` 以获得更优性能。
| 选项 | 目标 CPU | 使用场景 |
|------|----------|----------|
| **`NATIVE`** | 仅限你的特定 CPU | 本地构建(最佳性能,**默认**) |
| **`AVX512`** | Skylake-X、Ice Lake、Cascade Lake、Zen 4+ | 通用分发 |
| **`AVX2`** | Haswell (2013) 及更新 | 最大兼容性 |
| **`FANCY`** | Ice Lake+、Zen 4+ | 具有完整 AVX512 扩展的现代 CPU |
**配置示例:**
```bash
# 在 AMX CPU 上获得最高性能
# 本地使用 - 最高性能(默认行为)
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
export CPUINFER_ENABLE_AMX=ON # 或 OFF
# AVX512,无 AMX
# 分发构建 - 适用于任何 AVX512 CPU
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
# 兼容性优先构建
# 最大兼容性 - 适用于 2013 年以来的 CPU
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF

View File

@@ -229,8 +229,7 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
auto moe_cls = py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name);
moe_cls
.def(py::init<GeneralMOEConfig>())
moe_cls.def(py::init<GeneralMOEConfig>())
.def("warm_up_task", &MoeBindings::WarmUpBindings::cpuinfer_interface)
.def("load_weights_task",
py::overload_cast<std::shared_ptr<MoeClass>>(&MoeBindings::LoadWeightsBindings::cpuinfer_interface))
@@ -265,16 +264,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe,
args_->gpu_tp_count, args_->gpu_experts_num,
args_->w13_weight_ptrs, args_->w13_scale_ptrs,
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe, args_->gpu_tp_count,
args_->gpu_experts_num, args_->w13_weight_ptrs, args_->w13_scale_ptrs,
args_->w2_weight_ptrs, args_->w2_scale_ptrs);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe,
int gpu_tp_count, int gpu_experts_num,
py::list w13_weight_ptrs, py::list w13_scale_ptrs,
py::list w2_weight_ptrs, py::list w2_scale_ptrs) {
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe, int gpu_tp_count,
int gpu_experts_num, py::list w13_weight_ptrs,
py::list w13_scale_ptrs, py::list w2_weight_ptrs,
py::list w2_scale_ptrs) {
// Convert Python lists to std::vector<uintptr_t>
std::vector<uintptr_t> w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec;
@@ -283,16 +281,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
for (auto item : w2_weight_ptrs) w2_weight_vec.push_back(py::cast<uintptr_t>(item));
for (auto item : w2_scale_ptrs) w2_scale_vec.push_back(py::cast<uintptr_t>(item));
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
moe_cls.def("write_weight_scale_to_buffer_task", &WriteWeightScaleToBufferBindings::cpuinfer_interface,
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"),
py::arg("w13_weight_ptrs"), py::arg("w13_scale_ptrs"),
py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"), py::arg("w13_weight_ptrs"),
py::arg("w13_scale_ptrs"), py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
}
#endif
}

View File

@@ -19,41 +19,65 @@ BUILD_OPTIONS (for "build" or "all"):
--no-clean Do not delete local build/ before building (default cleans)
AUTO-DETECTION (Default):
The script will automatically detect your CPU capabilities and configure:
- If AMX instructions detected → NATIVE + AMX=ON
- Otherwise → NATIVE + AMX=OFF
The script will automatically detect your CPU and use ALL available features:
- CPUINFER_CPU_INSTRUCT = NATIVE (uses -march=native)
- CPUINFER_ENABLE_AMX = ON/OFF (based on detection)
- CPUINFER_ENABLE_AVX512_VNNI = ON/OFF (with fallback if OFF)
- CPUINFER_ENABLE_AVX512_BF16 = ON/OFF (with fallback if OFF)
✓ Best performance on YOUR machine
✗ Binary may NOT work on different/older CPUs
Use this when: Installing for local use only
MANUAL CONFIGURATION:
Use --manual flag and set these environment variables before running:
Use --manual flag when building for DISTRIBUTION or different machines.
Set these environment variables before running:
CPUINFER_CPU_INSTRUCT - CPU instruction set
Options: NATIVE, AVX512, AVX2, FANCY
CPUINFER_CPU_INSTRUCT - Target CPU instruction set
Options: AVX512, AVX2, FANCY, NATIVE
CPUINFER_ENABLE_AMX - Enable Intel AMX support
Options: ON, OFF
Manual configuration examples:
Distribution examples (portable binaries):
┌─────────────────────────────────────────────────────────────────────────┐
│ Configuration │ Use Case
├──────────────────────────────────┼──────────────────────────────────────┤
NATIVE + AMX=ON │ Best performance on AMX CPUs
AVX512 + AMX=OFF │ AVX512 CPUs without AMX
│ AVX2 + AMX=OFF │ Older CPUs or maximum compatibility │
└──────────────────────────────────┴──────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────
│ Configuration │ Target CPUs │ Use Case │
├────────────────────────┼──────────────────────────┼──────────────────────┤
AVX512 + AMX=OFF │ Skylake-X, Ice Lake, │ General distribution
│ Cascade Lake, Zen 4 │ (recommended)
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ AVX2 + AMX=OFF │ Haswell (2013) and newer │ Maximum compatibility│
├────────────────────────┼──────────────────────────┼──────────────────────┤
│ FANCY + AMX=OFF │ Ice Lake+, Zen 4+ │ Modern CPUs only │
│ │ (with full AVX512 ext) │ │
└────────────────────────┴──────────────────────────┴──────────────────────┘
Example manual build:
Use this when: Building Docker images, PyPI packages, or deploying to clusters
Example: Build for general distribution
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
$0 build --manual
# Result: Works on any CPU with AVX512 (2017+)
Advanced option (for binary distribution):
FANCY - AVX512 with full extensions for Ice Lake+/Zen 4+
Use this when building pre-compiled binaries to distribute.
Example: Build for maximum compatibility
export CPUINFER_CPU_INSTRUCT=AVX2
export CPUINFER_ENABLE_AMX=OFF
$0 build --manual
# Result: Works on any CPU with AVX2 (2013+)
Optional variables (with defaults):
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
CPUINFER_PARALLEL=8 Number of parallel build jobs
CPUINFER_VERBOSE=1 Verbose build output (0/1)
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
CPUINFER_PARALLEL=8 Number of parallel build jobs
CPUINFER_VERBOSE=1 Verbose build output (0/1)
CPUINFER_ENABLE_AVX512_VNNI=ON/OFF Override VNNI detection (auto if unset)
CPUINFER_ENABLE_AVX512_BF16=ON/OFF Override BF16 detection (auto if unset)
Software Fallback Support:
✓ If VNNI not available: Uses AVX512BW fallback (2-3x slower but works)
✓ If BF16 not available: Uses AVX512F fallback (5-10x slower but works)
→ Old CPUs with only AVX512F+BW can run all code (slower but functional)
EOF
exit 1
@@ -120,20 +144,38 @@ install_dependencies() {
}
# Function to detect CPU features
# Returns: "has_amx has_avx512_vnni has_avx512_bf16" (space-separated 0/1 values)
detect_cpu_features() {
local has_amx=0
local has_avx512_vnni=0
local has_avx512_bf16=0
if [ -f /proc/cpuinfo ]; then
local cpu_flags
cpu_flags=$(grep -m1 "^flags" /proc/cpuinfo | tr ' ' '\n')
# Check for AMX support on Linux
if grep -q "amx_tile\|amx_int8\|amx_bf16" /proc/cpuinfo; then
if echo "$cpu_flags" | grep -qE "amx_tile|amx_int8|amx_bf16"; then
has_amx=1
fi
# Check for AVX512_VNNI support
if echo "$cpu_flags" | grep -qE "avx512_vnni|avx512vnni"; then
has_avx512_vnni=1
fi
# Check for AVX512_BF16 support
if echo "$cpu_flags" | grep -qE "avx512_bf16|avx512bf16"; then
has_avx512_bf16=1
fi
elif [ "$(uname)" = "Darwin" ]; then
# macOS doesn't have AMX (ARM or Intel without AMX)
has_amx=0
has_avx512_vnni=0
has_avx512_bf16=0
fi
echo "$has_amx"
echo "$has_amx $has_avx512_vnni $has_avx512_bf16"
}
build_step() {
@@ -161,34 +203,6 @@ build_step() {
echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
fi
# Check for multi-variant build mode (Docker environment)
if [ "${CPUINFER_BUILD_ALL_VARIANTS:-0}" = "1" ]; then
echo "=========================================="
echo "Building ALL CPU variants (AMX/AVX512/AVX2)"
echo "=========================================="
echo ""
echo "This will build three variants in a single wheel:"
echo " - AMX variant (Intel Sapphire Rapids+)"
echo " - AVX512 variant (Intel Skylake-X/Ice Lake+)"
echo " - AVX2 variant (maximum compatibility)"
echo ""
echo "Runtime CPU detection will automatically select the best variant."
echo ""
export CPUINFER_FORCE_REBUILD=1
export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
echo "Building with:"
echo " CPUINFER_BUILD_ALL_VARIANTS=1"
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
echo ""
pip install . -v
return 0
fi
if [ "$MANUAL_MODE" = "0" ]; then
# Auto-detection mode
echo "=========================================="
@@ -196,25 +210,70 @@ build_step() {
echo "=========================================="
echo ""
HAS_AMX=$(detect_cpu_features)
# detect_cpu_features returns "has_amx has_avx512_vnni has_avx512_bf16"
CPU_FEATURES=$(detect_cpu_features)
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
HAS_AVX512_VNNI=$(echo "$CPU_FEATURES" | cut -d' ' -f2)
HAS_AVX512_BF16=$(echo "$CPU_FEATURES" | cut -d' ' -f3)
export CPUINFER_CPU_INSTRUCT=NATIVE
if [ "$HAS_AMX" = "1" ]; then
echo "✓ AMX instructions detected"
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=ON
echo " Configuration: NATIVE + AMX=ON (best performance)"
echo ""
echo " ⚠️ Note: If you plan to use LLAMAFILE backend, use manual mode:"
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2/FANCY"
echo " export CPUINFER_ENABLE_AMX=OFF"
echo " ./install.sh build --manual"
echo "Configuration: NATIVE + AMX=ON"
echo " ✓ Best performance on this machine"
echo " ✗ Binary requires Sapphire Rapids or newer CPU"
else
echo " AMX instructions not detected"
export CPUINFER_CPU_INSTRUCT=NATIVE
export CPUINFER_ENABLE_AMX=OFF
echo " Configuration: NATIVE + AMX=OFF"
echo ""
echo "Configuration: NATIVE + AMX=OFF"
echo " ✓ Using AVX512/AVX2 instructions"
fi
echo ""
echo " ⚠️ IMPORTANT: This binary is optimized for THIS CPU only"
echo " To build portable binaries for distribution, use:"
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2"
echo " export CPUINFER_ENABLE_AMX=OFF"
echo " ./install.sh build --manual"
# Fine-grained AVX512 subset detection (with fallback support)
echo ""
echo "AVX512 Feature Detection:"
# VNNI: Check if user manually set it, otherwise auto-detect
if [ -n "${CPUINFER_ENABLE_AVX512_VNNI:-}" ]; then
echo " VNNI: User override = $CPUINFER_ENABLE_AVX512_VNNI"
else
if [ "$HAS_AVX512_VNNI" = "1" ]; then
echo " VNNI: ✓ Detected (hardware acceleration enabled)"
export CPUINFER_ENABLE_AVX512_VNNI=ON
else
echo " VNNI: ✗ Not detected (will use software fallback, 2-3x slower)"
export CPUINFER_ENABLE_AVX512_VNNI=OFF
fi
fi
# BF16: Check if user manually set it, otherwise auto-detect
if [ -n "${CPUINFER_ENABLE_AVX512_BF16:-}" ]; then
echo " BF16: User override = $CPUINFER_ENABLE_AVX512_BF16"
else
if [ "$HAS_AVX512_BF16" = "1" ]; then
echo " BF16: ✓ Detected (hardware acceleration enabled)"
export CPUINFER_ENABLE_AVX512_BF16=ON
else
echo " BF16: ✗ Not detected (will use software fallback, 5-10x slower)"
export CPUINFER_ENABLE_AVX512_BF16=OFF
fi
fi
echo ""
echo " Note: Software fallbacks ensure all code works on older CPUs"
echo " Tip: Override with CPUINFER_ENABLE_AVX512_VNNI/BF16=ON/OFF"
echo ""
echo "To use manual configuration instead, run: $0 build --manual"
echo ""
@@ -250,12 +309,32 @@ build_step() {
# Warn about problematic configuration
if [ "$CPUINFER_CPU_INSTRUCT" = "NATIVE" ] && [ "$CPUINFER_ENABLE_AMX" = "OFF" ]; then
HAS_AMX=$(detect_cpu_features)
CPU_FEATURES=$(detect_cpu_features)
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
if [ "$HAS_AMX" = "1" ]; then
echo "⚠️ WARNING: NATIVE + AMX=OFF on AMX-capable CPU may cause compilation issues!"
echo " Recommended: Use AVX512 or AVX2 instead of NATIVE when AMX=OFF"
echo "=========================================="
echo "⚠️ WARNING: Risky Configuration"
echo "=========================================="
echo ""
read -p "Continue anyway? (y/N) " -n 1 -r
echo "Your configuration:"
echo " CPUINFER_CPU_INSTRUCT = NATIVE"
echo " CPUINFER_ENABLE_AMX = OFF"
echo ""
echo "Your CPU HAS AMX support!"
echo ""
echo "Problem:"
echo " • NATIVE uses -march=native which auto-enables ALL CPU features"
echo " • This may IGNORE your AMX=OFF setting"
echo " • The binary may still contain AMX instructions"
echo ""
echo "Recommended fixes:"
echo " 1) For portable build (recommended for distribution):"
echo " export CPUINFER_CPU_INSTRUCT=AVX512"
echo ""
echo " 2) If you want best performance on this CPU:"
echo " export CPUINFER_ENABLE_AMX=ON"
echo ""
read -p "Continue with risky configuration? (y/N) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit 1
@@ -271,12 +350,15 @@ export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
export CPUINFER_VERBOSE=${CPUINFER_VERBOSE:-1}
echo "=========================================="
echo "Building kt-kernel with configuration:"
echo " CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
echo " CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
echo " CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
echo "=========================================="
echo " CPUINFER_CPU_INSTRUCT = $CPUINFER_CPU_INSTRUCT"
echo " CPUINFER_ENABLE_AMX = $CPUINFER_ENABLE_AMX"
echo " CPUINFER_ENABLE_AVX512_VNNI = ${CPUINFER_ENABLE_AVX512_VNNI:-AUTO}"
echo " CPUINFER_ENABLE_AVX512_BF16 = ${CPUINFER_ENABLE_AVX512_BF16:-AUTO}"
echo " CPUINFER_BUILD_TYPE = $CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL = $CPUINFER_PARALLEL"
echo ""
pip install . -v

View File

@@ -94,10 +94,10 @@ class AMX_K2_MOE_TP {
}
#endif
inline void dump_buffer_b(const std::string &quantization_type, int expert_idx, const std::string &matrix_type,
typename T::BufferB *buffer) {
auto &quant_config = config_.quant_config;
int &group_size = quant_config.group_size;
inline void dump_buffer_b(const std::string& quantization_type, int expert_idx, const std::string& matrix_type,
typename T::BufferB* buffer) {
auto& quant_config = config_.quant_config;
int& group_size = quant_config.group_size;
printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
matrix_type.c_str());
@@ -110,7 +110,7 @@ class AMX_K2_MOE_TP {
cols = config_.hidden_size;
num_groups = cols / group_size;
scale_elem_count = num_groups * rows;
} else { // down
} else { // down
rows = config_.hidden_size;
cols = config_.intermediate_size;
num_groups = cols / group_size;
@@ -133,8 +133,8 @@ class AMX_K2_MOE_TP {
printf("\n");
}
// Dump quantized weights (as hex uint8)
size_t weight_size = (rows * cols) / 2; // INT4 packed
uint8_t *weight_ptr = (uint8_t *)buffer->b;
size_t weight_size = (rows * cols) / 2; // INT4 packed
uint8_t* weight_ptr = (uint8_t*)buffer->b;
printf(" Weights[first 32 bytes]: ");
for (int i = 0; i < std::min(32, (int)weight_size); i++) {
@@ -232,10 +232,12 @@ class AMX_K2_MOE_TP {
// (config_.expert_num * T::BufferA::M_STEP) in pool_count_ is to ensure padding for each experts.
pool_count_ = config_.max_len * config_.num_experts_per_tok + config_.expert_num * T::BufferA::M_STEP;
gate_up_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
gate_up_ba_pool_bytes_ =
(T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
gate_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
up_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
down_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
down_ba_pool_bytes_ =
(T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
down_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.hidden_size)) + pool_count_ * 64;
mem_requests.append_pointer(&gate_up_ba_pool_, gate_up_ba_pool_bytes_);
@@ -276,8 +278,7 @@ class AMX_K2_MOE_TP {
ith, nth);
// up part
up_bb_[expert_idx]->from_raw_mat(
(uint8_t*)config_.up_proj +
((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
(uint8_t*)config_.up_proj + ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
ith, nth);
},
nullptr);
@@ -302,19 +303,15 @@ class AMX_K2_MOE_TP {
[this, physical_to_logical_map](int task_id) {
uint64_t expert_idx = task_id;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
size_t scale_elem_count =
(config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
size_t scale_elem_count = (config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
// convert scales from BF16 to FP32
convert_or_copy(gate_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
convert_or_copy(up_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
convert_or_copy(down_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
},
nullptr);
// dump_buffer_b("native", 0, "down", down_bb_[0].get());
@@ -323,10 +320,10 @@ class AMX_K2_MOE_TP {
// Reconstruct weights for all experts to the output buffers
// This function handles the TP-specific portion of the reconstruction for all experts
void write_weights_to_buffer(int gpu_tp_count, int cpu_tp_count, int num_experts, const GeneralMOEConfig& full_config,
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) const {
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) const {
const int group_size = config_.quant_config.group_size;
auto pool = config_.pool->get_subpool(tp_part_idx);
@@ -379,18 +376,19 @@ class AMX_K2_MOE_TP {
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b;
float* gate_scale_src = gate_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight,
gate_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale),
gate_scale_src, cpu_tp_scale_elem_count);
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight, gate_weight_src,
cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale), gate_scale_src,
cpu_tp_scale_elem_count);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b;
float* up_scale_src = up_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight + gpu_tp_weight_bytes,
up_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
up_scale_src, cpu_tp_scale_elem_count);
up_weight_src, cpu_tp_weight_bytes);
convert_or_copy(
(ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
up_scale_src, cpu_tp_scale_elem_count);
// Down (w2) - need to handle column-wise slicing
// The down matrix is transposed compared to gate/up, so we need to extract by columns
@@ -406,17 +404,16 @@ class AMX_K2_MOE_TP {
size_t gpu_col_slice_offset = local_idx * (config_.intermediate_size >> 1);
std::memcpy(w2_weight_dst + w2_expert_base_weight + gpu_col_offset + gpu_col_slice_offset,
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset,
config_.intermediate_size / 2);
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset, config_.intermediate_size / 2);
// Same for scales
size_t gpu_scale_col_offset = col * ((full_config.intermediate_size / gpu_tp_count) / group_size);
size_t cpu_scale_col_offset = col * (config_.intermediate_size / group_size);
size_t gpu_scale_slice_offset = local_idx * (config_.intermediate_size / group_size);
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
down_bb_[expert_id]->d + cpu_scale_col_offset,
config_.intermediate_size / group_size);
convert_or_copy(
(ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
down_bb_[expert_id]->d + cpu_scale_col_offset, config_.intermediate_size / group_size);
}
},
nullptr);
@@ -460,16 +457,15 @@ class AMX_K2_MOE_TP {
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b + cpu_offset_weight;
float* gate_scale_src = gate_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight,
gate_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale),
gate_scale_src, data_per_gpu_tp_scale);
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight, gate_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale), gate_scale_src,
data_per_gpu_tp_scale);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b + cpu_offset_weight;
float* up_scale_src = up_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes,
up_weight_src, data_per_gpu_tp_weight);
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes, up_weight_src,
data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale + gpu_tp_scale_elem_count),
up_scale_src, data_per_gpu_tp_scale);
@@ -477,16 +473,20 @@ class AMX_K2_MOE_TP {
// The down matrix is transposed compared to gate/up, so we need to extract by columns
for (size_t col = 0; col < config_.hidden_size; col++) {
// Calculate the offset within the column for this GPU TP part
size_t col_offset_weight = (col * config_.intermediate_size / 2) + (local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) + (local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
size_t col_offset_weight = (col * config_.intermediate_size / 2) +
(local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) +
(local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
// Copy weights column by column
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight + (col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight +
(col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
(uint8_t*)down_bb_[expert_id]->b + col_offset_weight,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / 2);
// Copy scales column by column
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale + col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale +
col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
down_bb_[expert_id]->d + col_offset_scale,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size);
}
@@ -587,8 +587,7 @@ class AMX_K2_MOE_TP {
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
if (m_local_num_[i] == 0)
continue;
if (m_local_num_[i] == 0) continue;
size_t max_m = (m_local_num_[i] + M_STEP - 1) / M_STEP * M_STEP;
gate_up_ba_[i]->max_m = max_m;
gate_up_ba_[i]->set_data(gate_up_ba_pool_ptr);
@@ -801,7 +800,8 @@ class AMX_K2_MOE_TP {
down_time, weight_time, forward_total_time, max_local_num, qlen);
#endif
// for (int i = 0; i < qlen; i ++)
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size, (float*)output + i * config_.hidden_size);
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size,
// (float*)output + i * config_.hidden_size);
}
void forward_decode(int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
@@ -826,7 +826,7 @@ class AMX_K2_MOE_TP {
m_expert_id_map_[activated_expert] = expert_ids[i];
activated_expert++;
}
size_t offset = 0;
for (int i = 0; i < activated_expert; i++) {
auto expert_idx = m_expert_id_map_[i];
@@ -912,9 +912,9 @@ class AMX_K2_MOE_TP {
amx::vec_mul_kgroup(qlen, config_.intermediate_size, config_.hidden_size, group_size, gate_up_ba_[0],
gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth);
gate_bc_[expert_idx]->to_mat(qlen, m_local_gate_output_ptr_[expert_idx], ith, nth);
}
},
nullptr);
}
},
nullptr);
#ifdef DEBUG_K2_MOE
if (activated_expert > 0) {
@@ -971,7 +971,6 @@ class AMX_K2_MOE_TP {
}
}
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
@@ -1056,9 +1055,9 @@ class AMX_K2_MOE_TP {
}
__m512 weight = _mm512_set1_ps(weights[j]);
__m512 down_output0, down_output1;
avx512_32xbf16_to_32xfp32((__m512i*)(m_local_down_output_ptr_[expert_ids[j]] +
m_local_pos_[0][j] * config_.hidden_size + e),
&down_output0, &down_output1);
avx512_32xbf16_to_32xfp32(
(__m512i*)(m_local_down_output_ptr_[expert_ids[j]] + m_local_pos_[0][j] * config_.hidden_size + e),
&down_output0, &down_output1);
x0 = _mm512_fmadd_ps(down_output0, weight, x0);
x1 = _mm512_fmadd_ps(down_output1, weight, x1);
}
@@ -1151,28 +1150,26 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
// TP-slicing for gate and up (row-major slicing)
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
src_gate + ((i * weight_elem_count) >> 1),
(weight_elem_count >> 1));
src_gate + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
src_up + ((i * weight_elem_count) >> 1),
(weight_elem_count >> 1));
src_up + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
src_gate_scale + (i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
src_gate_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
src_up_scale + (i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
src_up_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
// TP-slicing for down (by column)
for (size_t col = 0; col < config.hidden_size; col++) {
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
src_down + ((col * config.intermediate_size + i * tpc.intermediate_size) >> 1),
(tpc.intermediate_size >> 1));
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
src_down_scale + (col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
memcpy((ggml_bf16_t*)tpc.down_scale +
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
src_down_scale +
(col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
}
},
@@ -1197,43 +1194,45 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
if (tps[i]->config_.load == false) {
pool->get_subpool(i)->do_work_stealing_job(
tpc.expert_num, nullptr,
[&](int expert_id_) { // weight and scale are all in col majored.
[&](int expert_id_) { // weight and scale are all in col majored.
size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
// weight and scale TP-slicing for gate and up
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.gate_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
(uint8_t*)config.gate_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.up_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
(uint8_t*)config.up_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.gate_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
(ggml_bf16_t*)config.gate_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
sizeof(ggml_bf16_t) * scales_elem_count);
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.up_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
(ggml_bf16_t*)config.up_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
sizeof(ggml_bf16_t) * scales_elem_count);
// weight and scale TP-slicing for down (by column)
for (size_t col = 0; col < config.hidden_size; col++) {
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
col * config.intermediate_size + i * tpc.intermediate_size) >>
1),
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
(ggml_bf16_t*)config.down_scale + ((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
1),
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
memcpy((ggml_bf16_t*)tpc.down_scale +
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
(ggml_bf16_t*)config.down_scale +
((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
}
},
nullptr);
@@ -1245,7 +1244,8 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
#ifdef LOAD_TIME_PROFILE
{
auto load_now_time = std::chrono::high_resolution_clock::now();
alloc_and_tp_slice_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
alloc_and_tp_slice_time =
std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
load_last = load_now_time;
}
#endif
@@ -1277,9 +1277,11 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
cleanup_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
}
auto load_end_time = std::chrono::high_resolution_clock::now();
auto load_total_time = std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
auto load_total_time =
std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
printf(
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, total: %ld us\n",
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, "
"total: %ld us\n",
tp_count, alloc_and_tp_slice_time, tps_load_time, cleanup_time, load_total_time);
#endif
@@ -1307,15 +1309,13 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
auto& config = this->config;
auto pool = config.pool;
// Each TP part writes to its corresponding buffer
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num,
w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num, w13_weight_ptrs, w13_scale_ptrs,
w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
// Note: w13 combines gate and up projections
// Split w13 pointers for gate and up
this->tps[numa_id]->write_weights_to_buffer(
gpu_tp_count, this->tp_count,
gpu_experts_num, this->config,
w13_weight_ptrs, w13_scale_ptrs, //gate + up use w13
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
this->tps[numa_id]->write_weights_to_buffer(gpu_tp_count, this->tp_count, gpu_experts_num, this->config,
w13_weight_ptrs, w13_scale_ptrs, // gate + up use w13
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
});
}

View File

@@ -350,10 +350,7 @@ struct BufferAKGroupImpl {
return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size);
}
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
: max_m(max_m),
k(k),
k_group_size(k_group_size) {
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : max_m(max_m), k(k), k_group_size(k_group_size) {
ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
ASSERT_RELEASE(max_m % M_STEP == 0, "max_m must be multiple of M_STEP");
ASSERT_RELEASE(k % K_STEP == 0, "k must be multiple of K_STEP");
@@ -459,8 +456,7 @@ struct BufferASmallKGroupImpl : public BufferAKGroupImpl<K> {
static constexpr int K_STEP = K::K_STEP;
static constexpr int K_BLOCK = K::K_BLOCK;
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
: Base(max_m, k, k_group_size, ptr) {}
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : Base(max_m, k, k_group_size, ptr) {}
// Override from_mat to write only 32 bytes per K_STEP iteration
void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
@@ -991,8 +987,8 @@ struct BufferBInt4WithZeroImpl {
template <typename K>
struct BufferBInt4KGroupImpl {
using dt = typename K::dt;
dt* b; // packed signed int4 weights, col majored
float* d; // scales only (no mins/zero-points), row majored
dt* b; // packed signed int4 weights, col majored
float* d; // scales only (no mins/zero-points), row majored
int n, k, k_group_size, k_group_count;
static constexpr int N_STEP = K::N_STEP;
@@ -1009,8 +1005,8 @@ struct BufferBInt4KGroupImpl {
assert(n % N_STEP == 0);
assert(k % K_STEP == 0);
if (n % N_STEP || k % K_STEP || k % k_group_size) {
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP,
K_STEP, k_group_size);
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, K_STEP,
k_group_size);
throw std::runtime_error("n or k is not aligned to N_STEP or K_STEP");
}
k_group_count = k / k_group_size;
@@ -1043,8 +1039,8 @@ struct BufferBInt4KGroupImpl {
// Get scale pointer for a specific row and k_group
float* get_scale(int n, int n_begin, int k, int k_begin) {
int k_group_idx = k_begin / k_group_size;
return d + n_begin * (k / k_group_size) + k_group_idx;
int k_group_idx = k_begin / k_group_size;
return d + n_begin * (k / k_group_size) + k_group_idx;
}
// Split range for parallel processing

View File

@@ -1552,9 +1552,9 @@ struct GemmKernel224Int4_1 {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2491,7 +2491,7 @@ struct GemmKernel224Int4_1KGroup {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
}
}
}
@@ -2503,7 +2503,7 @@ struct GemmKernel224Int4_1KGroup {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2767,7 +2767,7 @@ struct GemmKernel224Int4_1_LowKGroup {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_lo = _mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
}
}
}
@@ -2779,7 +2779,7 @@ struct GemmKernel224Int4_1_LowKGroup {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
__m512i b512_hi = _mm512_srli_epi32(_mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]), 4);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
}
}
}
@@ -2850,7 +2850,7 @@ struct GemmKernel224Int4SmallKGroup {
using output_t = int32_t;
static constexpr double ELEMENT_SIZE = 0.5;
static const int VNNI_BLK = 4;
static const int M_STEP = 1;
static const int N_STEP = 32;
static const int K_STEP = 32;
@@ -2870,18 +2870,15 @@ struct GemmKernel224Int4SmallKGroup {
alignas(64) static constexpr uint8_t hi_mask_arr[32] = {
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0
};
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};
alignas(64) static constexpr uint8_t lo_mask_arr[32] = {
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
};
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};
alignas(64) static constexpr uint8_t sign_xor_arr[32] = {
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88
};
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88};
static __m256i hi_mask() { return *((__m256i*)(&hi_mask_arr[0])); }
static __m256i lo_mask() { return *((__m256i*)(&lo_mask_arr[0])); }
static __m256i sign_xor_mask() { return *((__m256i*)(&sign_xor_arr[0])); }
@@ -2902,27 +2899,28 @@ struct GemmKernel224Int4SmallKGroup {
const __m512i lane_shuffle = _mm512_set_epi64(7, 6, 3, 2, 5, 4, 1, 0);
return _mm512_permutexvar_epi64(lane_shuffle, result);
}
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB *bb, BufferC* bc, int ith, int nth) {
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB* bb,
BufferC* bc, int ith, int nth) {
auto [n_start, n_end] = split_range_n(n, ith, nth);
for (int m_begin = 0; m_begin < m; m_begin ++) {
for (int m_begin = 0; m_begin < m; m_begin++) {
float* c = bc->get_submat(m, n, m_begin, n_start);
__m512i* a512 = (__m512i*)ba->get_submat(m, k, m_begin, 0);
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin ++) {
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin++) {
__m256i* b256 = (__m256i*)bb->get_submat(n, k, n_block_begin, 0);
float* as = (float*)ba->get_scale(m, m_begin, k, 0);
float* bs = (float*)bb->get_scale(n, n_block_begin, k, 0);
__m512 sum = _mm512_setzero_ps();
#define WORK_K_BLOCK(k_block) \
{ \
__m256 abscale0 = _mm256_set1_ps(as[(k_block)*2] * bs[(k_block)*2]); \
__m256 abscale1 = _mm256_set1_ps(as[(k_block)*2+1] * bs[(k_block)*2+1]); \
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
__m512i mul = _mm512_setzero_si512(); \
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
}
#define WORK_K_BLOCK(k_block) \
{ \
__m256 abscale0 = _mm256_set1_ps(as[(k_block) * 2] * bs[(k_block) * 2]); \
__m256 abscale1 = _mm256_set1_ps(as[(k_block) * 2 + 1] * bs[(k_block) * 2 + 1]); \
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
__m512i mul = _mm512_setzero_si512(); \
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
}
for (int k_block = 0; k_block < k / 64; k_block += 2) {
WORK_K_BLOCK(k_block);
@@ -2935,13 +2933,15 @@ struct GemmKernel224Int4SmallKGroup {
}
};
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);

View File

@@ -211,26 +211,44 @@ inline __m256i merge_q8K_bsum(block_q8_K* b) {
return _mm256_madd_epi16(_mm256_loadu_si256((__m256i*)b->bsums), _mm256_set1_epi16(1));
}
// Drop-in replacement for _mm512_dpbusd_epi32 (AVX512-VNNI VPDPBUSD):
// for each 32-bit lane, accumulate into `src` the sum of the four byte
// products (unsigned byte of `a`) * (signed byte of `b`).
inline __m512i _mm512_dpbusd_epi32_compat(__m512i src, __m512i a, __m512i b) {
#if defined(__AVX512VNNI__)
  // Fast path: a single native VNNI instruction.
  return _mm512_dpbusd_epi32(src, a, b);
#else
  // Fallback: emulate with AVX512BW 16-bit arithmetic.
  const __m512i mask_lo = _mm512_set1_epi16(0x00FF);
  const __m512i ones16 = _mm512_set1_epi16(1);
  // Even-indexed bytes: zero-extend a (unsigned); sign-extend b via shl/sar.
  __m512i a_even = _mm512_and_si512(a, mask_lo);
  __m512i b_even = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);
  // Odd-indexed bytes: logical shift keeps a unsigned, arithmetic shift keeps b signed.
  __m512i a_odd = _mm512_srli_epi16(a, 8);
  __m512i b_odd = _mm512_srai_epi16(b, 8);
  // u8*s8 products lie in [-32640, 32385], so the low-16 multiply is exact.
  __m512i prod_even = _mm512_mullo_epi16(a_even, b_even);
  __m512i prod_odd = _mm512_mullo_epi16(a_odd, b_odd);
  // madd against 1s widens and sums adjacent 16-bit products into 32-bit lanes.
  __m512i sum_even = _mm512_madd_epi16(prod_even, ones16);
  __m512i sum_odd = _mm512_madd_epi16(prod_odd, ones16);
  // The four-product lane sum fits in int32, matching non-saturating VPDPBUSD.
  return _mm512_add_epi32(src, _mm512_add_epi32(sum_even, sum_odd));
#endif
}
// Signed x signed int8 dot-product accumulate (emulates VPDPBSSD):
// for each 32-bit lane, add to `src` the sum of four (signed a byte) *
// (signed b byte) products.
//
// Strategy: move a's sign onto b, take |a| (now a valid unsigned operand),
// then reuse the unsigned x signed dot product.
// NOTE(review): a byte of -128 cannot be negated in int8, so the |a| trick is
// inexact for that single value -- confirm quantized inputs avoid -128.
inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
  // Split into 256-bit halves so _mm256_sign_epi8 can be used.
  __m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
  __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
  __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);
  // Transfer the sign of each a byte onto the corresponding b byte.
  b_lo = _mm256_sign_epi8(b_lo, a_lo);
  b_hi = _mm256_sign_epi8(b_hi, a_hi);
  // Reassemble the sign-adjusted b.
  b = _mm512_inserti64x4(b, b_lo, 0);
  b = _mm512_inserti64x4(b, b_hi, 1);
  // |a| is now interpretable as an unsigned byte operand.
  a = _mm512_abs_epi8(a);
  // Unsigned x signed dot product; compat wrapper falls back when VNNI is absent.
  return _mm512_dpbusd_epi32_compat(src, a, b);
}
} // namespace amx

View File

@@ -9,10 +9,42 @@ static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
_mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}
// FP32 to BF16 conversion (32 floats -> 32 bf16).
// src0 fills the low 256 bits of *dst, src1 the high 256 bits, matching
// _mm512_cvtne2ps_pbh(*src1, *src0). Fast path needs AVX512BF16; the
// fallback uses only AVX512F bit manipulation.
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
#if defined(HAVE_AVX512BF16) || defined(__AVX512BF16__)
  // Fast path: use native AVX512BF16 instruction
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
#else
  // Fallback: manual BF16 conversion using bit manipulation.
  // BF16 is the upper 16 bits of FP32 (with rounding).
  __m512i i0 = _mm512_castps_si512(*src0);
  __m512i i1 = _mm512_castps_si512(*src1);
  // Round to nearest even: add 0x7FFF + ((val >> 16) & 1).
  // NOTE(review): NaNs are not special-cased; a NaN whose payload lives only
  // in the low 16 mantissa bits rounds to an infinity encoding here --
  // confirm callers never feed NaN through this path.
  __m512i round0 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i0, 16), _mm512_set1_epi32(1)));
  __m512i round1 =
      _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i1, 16), _mm512_set1_epi32(1)));
  i0 = _mm512_add_epi32(i0, round0);
  i1 = _mm512_add_epi32(i1, round1);
  // Extract upper 16 bits (BF16).
  i0 = _mm512_srli_epi32(i0, 16);
  i1 = _mm512_srli_epi32(i1, 16);
  // Pack 32-bit values to 16-bit (values are <= 0xFFFF, so the unsigned
  // saturation in packus is a no-op).
  __m512i result = _mm512_packus_epi32(i0, i1);
  // packus interleaves per 128-bit lane; this qword permutation restores
  // [src0[0..15], src1[0..15]] order.
  result = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), result);
  _mm512_storeu_si512(dst, result);
#endif
}
// BF16 to FP32 conversion (32 bf16 -> 32 floats)
// This does NOT require AVX512BF16 - uses basic AVX512 bit manipulation
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
_mm512_storeu_ps(dst0, _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));

View File

@@ -194,7 +194,7 @@ class CMakeBuild(build_ext):
info["raw"]["flags"] = flags
# feature summary
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl", "avx512vnni"]):
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl"]):
info["features"].add("AVX512")
if "avx2" in flags or "avx2" in low:
info["features"].add("AVX2")
@@ -205,6 +205,16 @@ class CMakeBuild(build_ext):
):
info["features"].add("AMX")
# Fine-grained AVX512 subset detection
if any(f in flags for f in ["avx512_vnni", "avx512vnni"]):
info["features"].add("AVX512_VNNI")
if any(f in flags for f in ["avx512_bf16", "avx512bf16"]):
info["features"].add("AVX512_BF16")
if any(f in flags for f in ["avx512_vbmi", "avx512vbmi"]):
info["features"].add("AVX512_VBMI")
if any(f in flags for f in ["avx512_vpopcntdq", "avx512vpopcntdq"]):
info["features"].add("AVX512_VPOPCNTDQ")
elif sysname == "Darwin":
# macOS: Apple Silicon (arm64) vs Intel
arch = platform.machine().lower()
@@ -229,133 +239,6 @@ class CMakeBuild(build_ext):
return info
def build_extension(self, ext: CMakeExtension):
    """
    Entry point invoked by setuptools for each extension.

    Routes to the multi-variant build (AMX + AVX512 + AVX2 bundled in one
    wheel) when the CPUINFER_BUILD_ALL_VARIANTS env flag is set; otherwise
    performs the original single-variant build.
    """
    build_all = _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False)
    builder = self.build_multi_variants if build_all else self._build_single_variant
    builder(ext)
def build_multi_variants(self, ext: CMakeExtension):
    """
    Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel.

    This method is called when CPUINFER_BUILD_ALL_VARIANTS=1 is set.
    It builds three separate extensions with different CPU instruction sets
    (selected via CPUINFER_* environment variables) and renames each output
    .so file with a variant suffix so the loader can pick one at runtime.
    """
    print("=" * 80)
    print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)")
    print("=" * 80)

    # Define the 3 variants to build; each entry fully specifies both env
    # keys, so later variants overwrite earlier ones without leakage.
    variants = [
        {
            'name': 'amx',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'NATIVE',
                'CPUINFER_ENABLE_AMX': 'ON',
            },
            'description': 'AMX variant (Intel Sapphire Rapids+)'
        },
        {
            'name': 'avx512',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'AVX512',
                'CPUINFER_ENABLE_AMX': 'OFF',
            },
            'description': 'AVX512 variant (Intel Skylake-X/Ice Lake/Cascade Lake)'
        },
        {
            'name': 'avx2',
            'env': {
                'CPUINFER_CPU_INSTRUCT': 'AVX2',
                'CPUINFER_ENABLE_AMX': 'OFF',
            },
            'description': 'AVX2 variant (maximum compatibility)'
        }
    ]

    # Save original environment so it can be restored after all builds.
    original_env = os.environ.copy()

    extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()

    for i, variant in enumerate(variants, 1):
        print(f"\n{'=' * 80}")
        print(f"Building variant {i}/3: {variant['description']}")
        print(f"{'=' * 80}\n")

        # Set variant-specific environment variables
        os.environ.update(variant['env'])

        # Use a unique build directory for this variant
        original_build_temp = self.build_temp
        self.build_temp = str(Path(self.build_temp) / f"variant_{variant['name']}")

        try:
            # Build this variant (calls the single-variant build logic)
            self._build_single_variant(ext)

            # Rename the generated .so file to include variant suffix
            # Original: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
            # Renamed: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so

            # Extract the base extension name (without package prefix)
            # ext.name is "kt_kernel.kt_kernel_ext", we want "kt_kernel_ext"
            base_ext_name = ext.name.split('.')[-1]

            # Find the newly built .so file; only un-suffixed files count,
            # so already-renamed variants from prior iterations are skipped.
            import time
            time.sleep(0.5)  # Give filesystem time to sync
            built_candidates = [
                f for f in Path(extdir).glob("*.so")
                if f.name.startswith(base_ext_name) and not f.name.startswith(f"_{base_ext_name}_")
            ]

            if not built_candidates:
                print(f"WARNING: No .so file found for {base_ext_name} in {extdir}")
                print(f"Files in {extdir}:")
                for f in Path(extdir).glob("*.so"):
                    print(f" {f.name}")

            for so_file in built_candidates:
                # Extract the python tag part (e.g., ".cpython-311-x86_64-linux-gnu.so")
                suffix = so_file.name.replace(base_ext_name, "")
                new_name = f"_{base_ext_name}_{variant['name']}{suffix}"
                new_path = extdir / new_name
                print(f"-- Renaming {so_file.name} -> {new_name}")
                if new_path.exists():
                    print(f" WARNING: Target file already exists, removing: {new_path}")
                    new_path.unlink()
                so_file.rename(new_path)
                print(f" ✓ Successfully renamed to {new_name}")
        finally:
            # Restore build_temp for next iteration
            self.build_temp = original_build_temp

    # Restore original environment
    os.environ.clear()
    os.environ.update(original_env)

    print(f"\n{'=' * 80}")
    print("✓ Successfully built all 3 CPU variants")
    print(f"{'=' * 80}\n")
def _build_single_variant(self, ext: CMakeExtension):
"""
Build a single CPU variant. This contains the core build logic
extracted from the original build_extension method.
"""
# Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
def detect_cuda_toolkit() -> bool:
# Respect CUDA_HOME
@@ -403,10 +286,6 @@ class CMakeBuild(build_ext):
auto_cuda = detect_cuda_toolkit()
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
elif cuda_env:
print("-- CPUINFER_USE_CUDA explicitly enabled")
else:
print("-- CPUINFER_USE_CUDA explicitly disabled")
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
cfg = default_build_type()
@@ -461,6 +340,17 @@ class CMakeBuild(build_ext):
else:
print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella")
# Fine-grained AVX512 subset flags: only enable if CPU actually supports them
# These are passed to CMake to conditionally add compiler flags
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_VNNI", "LLAMA_AVX512_VNNI"):
if "AVX512_VNNI" in d["features"]:
cmake_args.append("-DLLAMA_AVX512_VNNI=ON")
print("-- AVX512_VNNI detected; enabling (-DLLAMA_AVX512_VNNI=ON)")
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_BF16", "LLAMA_AVX512_BF16"):
if "AVX512_BF16" in d["features"]:
cmake_args.append("-DLLAMA_AVX512_BF16=ON")
print("-- AVX512_BF16 detected; enabling (-DLLAMA_AVX512_BF16=ON)")
# Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend
# (Do not enable purely on vendor auto-detection to avoid surprise behavior.)
amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None)
@@ -562,6 +452,7 @@ class CMakeBuild(build_ext):
# Version (simple). If you later add a python package dir, you can read from it.
################################################################################
# Import version from shared version.py at project root
_version_file = Path(__file__).resolve().parent.parent / "version.py"
if _version_file.exists():