mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 14:29:22 +00:00
Fix CPU Instruction Set and Installation (#1729)
* [fix](kt-kernel): fix AVX512 cpu instruction set detection * [feat](kt-kernel): AVX512 fallback kernel for RAW-INT4 * [fix](kt-kernel): fix setup version issue * [fix](kt-kernel): update install for custom build * [docs](kt-kernel): new installation guide for various cpu instruction set * [fix](kt-kernel): fix _mm512_dpbusd_epi32_compat fallback implmentation * [style](kt-kernel): clang format
This commit is contained in:
@@ -28,7 +28,7 @@ option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF
|
||||
# LTO control
|
||||
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)
|
||||
|
||||
project(kt_kernel_ext VERSION 0.4.2)
|
||||
project(kt_kernel_ext VERSION 0.4.4)
|
||||
# Choose compilers BEFORE project() so CMake honors them
|
||||
if(USE_CONDA_TOOLCHAIN)
|
||||
if(NOT DEFINED ENV{CONDA_PREFIX} OR NOT EXISTS "$ENV{CONDA_PREFIX}")
|
||||
@@ -378,7 +378,20 @@ if(HOST_IS_X86)
|
||||
target_link_libraries(${test_name} llama OpenMP::OpenMP_CXX numa)
|
||||
endforeach()
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS -mfma -mf16c -mavx512bf16 -mavx512vnni)
|
||||
# Note: AVX512 subset flags (-mavx512vnni, -mavx512bf16) are already added
|
||||
# in the generic x86 detection block above (lines 276-289) when corresponding
|
||||
# LLAMA_AVX512_* options are enabled. No need to add them again here.
|
||||
# -mfma is already added by LLAMA_NATIVE (line 254), LLAMA_AVX*, or LLAMA_FMA blocks.
|
||||
# Only add -mf16c if LLAMA_F16C is not already enabled.
|
||||
if(NOT LLAMA_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
endif()
|
||||
if(LLAMA_AVX512_VNNI)
|
||||
message(STATUS "AVX512_VNNI enabled")
|
||||
endif()
|
||||
if(LLAMA_AVX512_BF16)
|
||||
message(STATUS "AVX512_BF16 enabled")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
|
||||
- ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
|
||||
- ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
|
||||
- ✅ **AMD CPUs with BLIS**: Supported (for int8 prefill & decode)
|
||||
- ✅ **Kimi-K2 Native INT4 (RAWINT4)**: Supported on AVX512 CPUs (CPU-GPU shared INT4 weights) - [Guide](../doc/en/Kimi-K2-Thinking-Native.md)
|
||||
|
||||
## Features
|
||||
|
||||
@@ -49,6 +50,8 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
|
||||
|
||||
### Option 1: Install from PyPI (Recommended for Most Users)
|
||||
|
||||
Coming soon...
|
||||
|
||||
Choose the version matching your CUDA installation:
|
||||
|
||||
```bash
|
||||
@@ -104,76 +107,55 @@ python -c "import kt_kernel"
|
||||
|
||||
---
|
||||
|
||||
### Option 2: Install from Source (For AMD, ARM, or Custom Builds)
|
||||
### Option 2: Install from Source (For Local Use or Custom Builds)
|
||||
|
||||
If you need AMD (BLIS), ARM (KML), or custom CUDA versions, build from source:
|
||||
Build from source for local installation or when you need AMD (BLIS), ARM (KML), or custom CUDA versions.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
First, initialize git submodules:
|
||||
First, initialize git submodules and create a conda environment:
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
#### Quick Installation
|
||||
|
||||
Step 0: Create and activate a conda environment (recommended):
|
||||
|
||||
```bash
|
||||
conda create -n kt-kernel python=3.11 -y
|
||||
conda activate kt-kernel
|
||||
```
|
||||
|
||||
You can now install in two clear steps using the same script.
|
||||
#### Quick Installation (Recommended)
|
||||
|
||||
**Option A: Two-step** (specify dependencies installation and build separately)
|
||||
|
||||
```bash
|
||||
# 1) Install system prerequisites (cmake, hwloc, pkg-config)
|
||||
./install.sh deps
|
||||
|
||||
# 2) Build and install kt-kernel (auto-detects CPU instruction set)
|
||||
# By default, the script cleans the local ./build directory before compiling
|
||||
./install.sh build
|
||||
```
|
||||
|
||||
**Option B: One-step**
|
||||
Simply run the install script - it will auto-detect your CPU and optimize for best performance:
|
||||
|
||||
```bash
|
||||
./install.sh
|
||||
```
|
||||
|
||||
The install script will:
|
||||
- Auto-detect CPU capabilities (AMX support)
|
||||
- Install `cmake` via conda (if available)
|
||||
- Install system dependencies (`libhwloc-dev`, `pkg-config`) based on your OS
|
||||
|
||||
**What gets configured automatically:**
|
||||
- AMX CPU detected → `NATIVE + AMX=ON`
|
||||
- No AMX detected → `NATIVE + AMX=OFF`
|
||||
|
||||
⚠️ **Important for LLAMAFILE backend users:**
|
||||
If you have an AMX-capable CPU but plan to use the LLAMAFILE backend, do NOT use the default auto-detection build.
|
||||
Use "manual mode" with `CPUINFER_CPU_INSTRUCT` set to `AVX512` or `AVX2` instead of `NATIVE` to avoid compilation issues (see below).
|
||||
|
||||
⚠️ **Important for BLIS AMD backend users:**
|
||||
for the installation guide, see this [issue](https://github.com/kvcache-ai/ktransformers/issues/1601)
|
||||
|
||||
|
||||
### Manual Configuration (Advanced)
|
||||
|
||||
If you need specific build options (e.g., for LLAMAFILE backend, compatibility, or binary distribution):
|
||||
**What happens automatically:**
|
||||
- Auto-detects CPU capabilities (AMX, AVX512_VNNI, AVX512_BF16)
|
||||
- Installs system dependencies (`cmake`, `libhwloc-dev`, `pkg-config`)
|
||||
- Builds optimized binary for **your CPU only** (using `-march=native`)
|
||||
- **Software fallbacks**: Automatically enabled for CPUs without VNNI/BF16
|
||||
|
||||
**Optional: Two-step installation**
|
||||
```bash
|
||||
# Example for LLAMAFILE backend on AMX CPU with AVX512
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512 # Options: NATIVE, AVX512, AVX2, FANCY
|
||||
export CPUINFER_ENABLE_AMX=OFF # Options: ON, OFF
|
||||
|
||||
# Build only (skip auto-detection of instruction set)
|
||||
./install.sh build --manual
|
||||
./install.sh deps # Install dependencies only
|
||||
./install.sh build # Build and install kt-kernel
|
||||
```
|
||||
|
||||
For advanced build options and binary distribution, see the [Build Configuration](#build-configuration) section. If you encounter issues, refer to [Error Troubleshooting](#error-troubleshooting).
|
||||
**CPU Requirements by Backend:**
|
||||
|
||||
| Backend | Minimum CPU Requirement | Example CPUs | Notes |
|
||||
|---------|-------------------------|--------------|-------|
|
||||
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+), AMD Zen+ | Universal compatibility |
|
||||
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+), Ice Lake, Cascade Lake | Software fallbacks for VNNI/BF16 |
|
||||
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | Best performance, requires AMX hardware |
|
||||
|
||||
**Software Fallback Support (AVX512 backends):**
|
||||
- ✅ VNNI fallback: Uses AVX512BW instructions
|
||||
- ✅ BF16 fallback: Uses AVX512F instructions
|
||||
- ✅ Older AVX512 CPUs (Skylake-X, Cascade Lake) can run RAWINT4 with fallbacks
|
||||
|
||||
⚠️ **Portability Note:** The default build is optimized for your specific CPU and may not work on different/older CPUs. For portable builds or binary distribution, see [Manual Configuration](#manual-configuration-advanced) below.
|
||||
|
||||
⚠️ **AMD BLIS backend users:** See [installation guide](https://github.com/kvcache-ai/ktransformers/issues/1601) for AMD-specific setup.
|
||||
|
||||
## Verification
|
||||
|
||||
@@ -482,11 +464,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
|
||||
KTMoEWrapper.clear_buffer_cache()
|
||||
```
|
||||
|
||||
### Manual Configuration (Advanced)
|
||||
|
||||
For portable builds, binary distribution, or cross-machine deployment, you need to manually specify target instruction sets:
|
||||
|
||||
```bash
|
||||
# General distribution (works on any AVX512 CPU from 2017+)
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
|
||||
# Maximum compatibility (works on any CPU from 2013+)
|
||||
export CPUINFER_CPU_INSTRUCT=AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
|
||||
# Modern CPUs only (Ice Lake+, Zen 4+)
|
||||
export CPUINFER_CPU_INSTRUCT=FANCY
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
```
|
||||
|
||||
**Optional: Override VNNI/BF16 detection**
|
||||
```bash
|
||||
# Force enable/disable VNNI and BF16 (for testing fallbacks)
|
||||
export CPUINFER_ENABLE_AVX512_VNNI=OFF
|
||||
export CPUINFER_ENABLE_AVX512_BF16=OFF
|
||||
./install.sh
|
||||
```
|
||||
|
||||
See `./install.sh --help` for all available options.
|
||||
|
||||
---
|
||||
|
||||
## Build Configuration
|
||||
|
||||
### Manual Installation
|
||||
### Manual Installation (Without install.sh)
|
||||
|
||||
If you prefer manual installation without the `install.sh` script, follow these steps:
|
||||
If you prefer manual installation without the `install.sh` script:
|
||||
|
||||
#### 1. Install System Dependencies
|
||||
|
||||
@@ -508,27 +523,29 @@ If you prefer manual installation without the `install.sh` script, follow these
|
||||
|
||||
**Instruction Set Details:**
|
||||
|
||||
- **`NATIVE`**: Auto-detect and use all available CPU instructions (`-march=native`) - **Recommended for best performance**
|
||||
- **`AVX512`**: Explicit AVX512 support for Skylake-SP and Cascade Lake
|
||||
- **`AVX2`**: AVX2 support for maximum compatibility
|
||||
- **`FANCY`**: AVX512 with full extensions (AVX512F/BW/DQ/VL/VNNI) for Ice Lake+ and Zen 4+. Use this when building pre-compiled binaries to distribute to users with modern CPUs. For local builds, prefer `NATIVE` for better performance.
|
||||
| Option | Target CPUs | Use Case |
|
||||
|--------|-------------|----------|
|
||||
| **`NATIVE`** | Your specific CPU only | Local builds (best performance, **default**) |
|
||||
| **`AVX512`** | Skylake-X, Ice Lake, Cascade Lake, Zen 4+ | General distribution |
|
||||
| **`AVX2`** | Haswell (2013) and newer | Maximum compatibility |
|
||||
| **`FANCY`** | Ice Lake+, Zen 4+ | Modern CPUs with full AVX512 extensions |
|
||||
|
||||
**Example Configurations:**
|
||||
|
||||
```bash
|
||||
# Maximum performance on AMX CPU
|
||||
# Local use - maximum performance (default behavior)
|
||||
export CPUINFER_CPU_INSTRUCT=NATIVE
|
||||
export CPUINFER_ENABLE_AMX=ON
|
||||
export CPUINFER_ENABLE_AMX=ON # or OFF
|
||||
|
||||
# AVX512 CPU without AMX
|
||||
# Distribution build - works on any AVX512 CPU
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
|
||||
# Compatibility build
|
||||
# Maximum compatibility - works on CPUs since 2013
|
||||
export CPUINFER_CPU_INSTRUCT=AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
|
||||
# Debug build for development
|
||||
# Debug build
|
||||
export CPUINFER_BUILD_TYPE=Debug
|
||||
export CPUINFER_VERBOSE=1
|
||||
```
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
- ✅ **带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重)
|
||||
- ✅ **通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重)
|
||||
- ✅ **带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode)
|
||||
- ✅ **Kimi-K2 原生 INT4(RAWINT4)**:支持 AVX512 CPU(CPU-GPU 共享 INT4 权重)- [使用指南](../doc/en/Kimi-K2-Thinking-Native.md)
|
||||
|
||||
## 特性
|
||||
|
||||
@@ -49,69 +50,55 @@
|
||||
|
||||
## 安装
|
||||
|
||||
### 先决条件
|
||||
### 从源码安装(本机使用或自定义构建)
|
||||
|
||||
首先初始化子模块:
|
||||
适用于本地安装,或需要 AMD (BLIS)、ARM (KML) 或自定义 CUDA 版本的场景。
|
||||
|
||||
#### 先决条件
|
||||
|
||||
首先初始化子模块并创建 conda 环境:
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
### 快速安装(推荐)
|
||||
|
||||
第 0 步:创建并激活一个 conda 环境(推荐):
|
||||
|
||||
```bash
|
||||
conda create -n kt-kernel python=3.11 -y
|
||||
conda activate kt-kernel
|
||||
```
|
||||
|
||||
随后可以用同一个脚本分两步或一步安装。
|
||||
#### 快速安装(推荐)
|
||||
|
||||
方案 A:两步(可以指定依赖安装与编译构建)
|
||||
|
||||
```bash
|
||||
# 1)安装系统依赖(cmake, hwloc, pkg-config)
|
||||
./install.sh deps
|
||||
|
||||
# 2)构建并安装 kt-kernel(自动检测 CPU 指令集)
|
||||
# 默认会在编译前清理本地 ./build 目录
|
||||
./install.sh build
|
||||
```
|
||||
|
||||
方案 B:一步
|
||||
只需运行安装脚本,它会自动检测 CPU 并优化性能:
|
||||
|
||||
```bash
|
||||
./install.sh
|
||||
```
|
||||
|
||||
安装脚本会:
|
||||
- 自动检测 CPU 能力(是否支持 AMX)
|
||||
- 尝试通过 conda 安装 `cmake`(若可用)
|
||||
- 根据你的操作系统安装系统依赖(`libhwloc-dev`、`pkg-config`)
|
||||
|
||||
**自动配置内容:**
|
||||
- 检测到 AMX CPU → 使用 `NATIVE + AMX=ON`
|
||||
- 未检测到 AMX → 使用 `NATIVE + AMX=OFF`
|
||||
|
||||
⚠️ **LLAMAFILE 后端用户特别说明:**
|
||||
如果你有带 AMX 的 CPU,但是计划使用 LLAMAFILE 后端,请不要使用默认的自动检测构建方式。
|
||||
请使用“手动模式”,并将 `CPUINFER_CPU_INSTRUCT` 设为 `AVX512` 或 `AVX2` 而非 `NATIVE`,以避免编译期异常(见下文)。
|
||||
|
||||
### 手动配置(进阶)
|
||||
|
||||
如果你需要更精细的构建选项(例如为 LLAMAFILE 后端、兼容性或二进制分发配置):
|
||||
**自动完成的操作:**
|
||||
- 自动检测 CPU 能力(AMX、AVX512_VNNI、AVX512_BF16)
|
||||
- 安装系统依赖(`cmake`、`libhwloc-dev`、`pkg-config`)
|
||||
- 为**你的 CPU** 构建优化二进制(使用 `-march=native`)
|
||||
- **软件回退机制**:为不支持 VNNI/BF16 的 CPU 自动启用
|
||||
|
||||
**可选:分步安装**
|
||||
```bash
|
||||
# 在带 AMX 的 CPU 上构建 LLAMAFILE 后端的示例(使用 AVX512)
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512 # 选项: NATIVE, AVX512, AVX2, FANCY
|
||||
export CPUINFER_ENABLE_AMX=OFF # 选项: ON, OFF
|
||||
|
||||
# 仅构建(不进行指令集的自动检测)
|
||||
./install.sh build --manual
|
||||
./install.sh deps # 仅安装依赖
|
||||
./install.sh build # 构建并安装 kt-kernel
|
||||
```
|
||||
|
||||
更多构建选项和二进制分发配置,请参见 [构建配置](#构建配置) 一节。
|
||||
如果遇到问题,可参考 [错误排查](#错误排查)。
|
||||
**不同后端的 CPU 要求:**
|
||||
|
||||
| 后端 | 最低 CPU 要求 | 示例 CPU | 说明 |
|
||||
|------|---------------|----------|------|
|
||||
| **LLAMAFILE** | AVX2 | Intel Haswell (2013+)、AMD Zen+ | 通用兼容性 |
|
||||
| **RAWINT4** | AVX512F + AVX512BW | Intel Skylake-X (2017+)、Ice Lake、Cascade Lake | 支持 VNNI/BF16 软件回退 |
|
||||
| **AMXINT4/INT8** | AMX | Intel Sapphire Rapids (2023+) | 最佳性能,需要 AMX 硬件 |
|
||||
|
||||
**软件回退支持(AVX512 后端):**
|
||||
- ✅ VNNI 回退:使用 AVX512BW 指令
|
||||
- ✅ BF16 回退:使用 AVX512F 指令
|
||||
- ✅ 老的 AVX512 CPU(Skylake-X、Cascade Lake)可以运行 RAWINT4(使用回退)
|
||||
|
||||
⚠️ **可移植性说明:** 默认构建针对你的特定 CPU 优化,可能无法在不同/更老的 CPU 上运行。如需打包分发或跨机器部署,请参见下方的 [手动配置](#手动配置进阶)。
|
||||
|
||||
⚠️ **AMD BLIS 后端用户:** 请参见 [安装指南](https://github.com/kvcache-ai/ktransformers/issues/1601) 了解 AMD 专用配置。
|
||||
|
||||
## 验证安装
|
||||
|
||||
@@ -421,11 +408,44 @@ batch_sizes = KTMoEWrapper.get_capture_batch_sizes()
|
||||
KTMoEWrapper.clear_buffer_cache()
|
||||
```
|
||||
|
||||
### 手动配置(进阶)
|
||||
|
||||
如需打包分发、跨机器部署或构建可移植二进制,需要手动指定目标指令集:
|
||||
|
||||
```bash
|
||||
# 通用分发版(适用于 2017+ 的任何 AVX512 CPU)
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
|
||||
# 最大兼容性(适用于 2013+ 的任何 CPU)
|
||||
export CPUINFER_CPU_INSTRUCT=AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
|
||||
# 仅限现代 CPU(Ice Lake+、Zen 4+)
|
||||
export CPUINFER_CPU_INSTRUCT=FANCY
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
./install.sh build --manual
|
||||
```
|
||||
|
||||
**可选:覆盖 VNNI/BF16 检测**
|
||||
```bash
|
||||
# 强制启用/禁用 VNNI 和 BF16(用于测试回退)
|
||||
export CPUINFER_ENABLE_AVX512_VNNI=OFF
|
||||
export CPUINFER_ENABLE_AVX512_BF16=OFF
|
||||
./install.sh
|
||||
```
|
||||
|
||||
运行 `./install.sh --help` 查看所有可用选项。
|
||||
|
||||
---
|
||||
|
||||
## 构建配置
|
||||
|
||||
### 手动安装
|
||||
### 手动安装(不使用 install.sh)
|
||||
|
||||
如果你不想使用 `install.sh`,可以按以下步骤手动构建:
|
||||
如果你不想使用 `install.sh` 脚本:
|
||||
|
||||
#### 1. 安装系统依赖
|
||||
|
||||
@@ -447,24 +467,25 @@ KTMoEWrapper.clear_buffer_cache()
|
||||
|
||||
**指令集说明:**
|
||||
|
||||
- **`NATIVE`**:自动检测并启用所有可用 CPU 指令(`-march=native`)——**本机运行时首选**
|
||||
- **`AVX512`**:为 Skylake-SP / Cascade Lake 显式开启 AVX512
|
||||
- **`AVX2`**:开启 AVX2,兼容性较好
|
||||
- **`FANCY`**:开启完整 AVX512 扩展(AVX512F/BW/DQ/VL/VNNI),适用于 Ice Lake+ 和 Zen 4+ 等较新平台。
|
||||
用于向用户分发预编译二进制时推荐;本地构建推荐使用 `NATIVE` 以获得更优性能。
|
||||
| 选项 | 目标 CPU | 使用场景 |
|
||||
|------|----------|----------|
|
||||
| **`NATIVE`** | 仅限你的特定 CPU | 本地构建(最佳性能,**默认**) |
|
||||
| **`AVX512`** | Skylake-X、Ice Lake、Cascade Lake、Zen 4+ | 通用分发 |
|
||||
| **`AVX2`** | Haswell (2013) 及更新 | 最大兼容性 |
|
||||
| **`FANCY`** | Ice Lake+、Zen 4+ | 具有完整 AVX512 扩展的现代 CPU |
|
||||
|
||||
**配置示例:**
|
||||
|
||||
```bash
|
||||
# 在 AMX CPU 上获得最高性能
|
||||
# 本地使用 - 最高性能(默认行为)
|
||||
export CPUINFER_CPU_INSTRUCT=NATIVE
|
||||
export CPUINFER_ENABLE_AMX=ON
|
||||
export CPUINFER_ENABLE_AMX=ON # 或 OFF
|
||||
|
||||
# 仅 AVX512,无 AMX
|
||||
# 分发构建 - 适用于任何 AVX512 CPU
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
|
||||
# 兼容性优先构建
|
||||
# 最大兼容性 - 适用于 2013 年以来的 CPU
|
||||
export CPUINFER_CPU_INSTRUCT=AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
|
||||
|
||||
@@ -229,8 +229,7 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
|
||||
|
||||
auto moe_cls = py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name);
|
||||
|
||||
moe_cls
|
||||
.def(py::init<GeneralMOEConfig>())
|
||||
moe_cls.def(py::init<GeneralMOEConfig>())
|
||||
.def("warm_up_task", &MoeBindings::WarmUpBindings::cpuinfer_interface)
|
||||
.def("load_weights_task",
|
||||
py::overload_cast<std::shared_ptr<MoeClass>>(&MoeBindings::LoadWeightsBindings::cpuinfer_interface))
|
||||
@@ -265,16 +264,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
|
||||
|
||||
static void inner(void* args) {
|
||||
Args* args_ = (Args*)args;
|
||||
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe,
|
||||
args_->gpu_tp_count, args_->gpu_experts_num,
|
||||
args_->w13_weight_ptrs, args_->w13_scale_ptrs,
|
||||
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe, args_->gpu_tp_count,
|
||||
args_->gpu_experts_num, args_->w13_weight_ptrs, args_->w13_scale_ptrs,
|
||||
args_->w2_weight_ptrs, args_->w2_scale_ptrs);
|
||||
}
|
||||
|
||||
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe,
|
||||
int gpu_tp_count, int gpu_experts_num,
|
||||
py::list w13_weight_ptrs, py::list w13_scale_ptrs,
|
||||
py::list w2_weight_ptrs, py::list w2_scale_ptrs) {
|
||||
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe, int gpu_tp_count,
|
||||
int gpu_experts_num, py::list w13_weight_ptrs,
|
||||
py::list w13_scale_ptrs, py::list w2_weight_ptrs,
|
||||
py::list w2_scale_ptrs) {
|
||||
// Convert Python lists to std::vector<uintptr_t>
|
||||
std::vector<uintptr_t> w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec;
|
||||
|
||||
@@ -283,16 +281,15 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
|
||||
for (auto item : w2_weight_ptrs) w2_weight_vec.push_back(py::cast<uintptr_t>(item));
|
||||
for (auto item : w2_scale_ptrs) w2_scale_vec.push_back(py::cast<uintptr_t>(item));
|
||||
|
||||
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
|
||||
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
|
||||
w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec};
|
||||
return std::make_pair((intptr_t)&inner, (intptr_t)args);
|
||||
}
|
||||
};
|
||||
|
||||
moe_cls.def("write_weight_scale_to_buffer_task", &WriteWeightScaleToBufferBindings::cpuinfer_interface,
|
||||
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"),
|
||||
py::arg("w13_weight_ptrs"), py::arg("w13_scale_ptrs"),
|
||||
py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
|
||||
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"), py::arg("w13_weight_ptrs"),
|
||||
py::arg("w13_scale_ptrs"), py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -19,41 +19,65 @@ BUILD_OPTIONS (for "build" or "all"):
|
||||
--no-clean Do not delete local build/ before building (default cleans)
|
||||
|
||||
AUTO-DETECTION (Default):
|
||||
The script will automatically detect your CPU capabilities and configure:
|
||||
- If AMX instructions detected → NATIVE + AMX=ON
|
||||
- Otherwise → NATIVE + AMX=OFF
|
||||
The script will automatically detect your CPU and use ALL available features:
|
||||
- CPUINFER_CPU_INSTRUCT = NATIVE (uses -march=native)
|
||||
- CPUINFER_ENABLE_AMX = ON/OFF (based on detection)
|
||||
- CPUINFER_ENABLE_AVX512_VNNI = ON/OFF (with fallback if OFF)
|
||||
- CPUINFER_ENABLE_AVX512_BF16 = ON/OFF (with fallback if OFF)
|
||||
|
||||
✓ Best performance on YOUR machine
|
||||
✗ Binary may NOT work on different/older CPUs
|
||||
|
||||
Use this when: Installing for local use only
|
||||
|
||||
MANUAL CONFIGURATION:
|
||||
Use --manual flag and set these environment variables before running:
|
||||
Use --manual flag when building for DISTRIBUTION or different machines.
|
||||
Set these environment variables before running:
|
||||
|
||||
CPUINFER_CPU_INSTRUCT - CPU instruction set
|
||||
Options: NATIVE, AVX512, AVX2, FANCY
|
||||
CPUINFER_CPU_INSTRUCT - Target CPU instruction set
|
||||
Options: AVX512, AVX2, FANCY, NATIVE
|
||||
CPUINFER_ENABLE_AMX - Enable Intel AMX support
|
||||
Options: ON, OFF
|
||||
|
||||
Manual configuration examples:
|
||||
Distribution examples (portable binaries):
|
||||
|
||||
┌─────────────────────────────────────────────────────────────────────────┐
|
||||
│ Configuration │ Use Case │
|
||||
├──────────────────────────────────┼──────────────────────────────────────┤
|
||||
│ NATIVE + AMX=ON │ Best performance on AMX CPUs │
|
||||
│ AVX512 + AMX=OFF │ AVX512 CPUs without AMX │
|
||||
│ AVX2 + AMX=OFF │ Older CPUs or maximum compatibility │
|
||||
└──────────────────────────────────┴──────────────────────────────────────┘
|
||||
┌──────────────────────────────────────────────────────────────────────────┐
|
||||
│ Configuration │ Target CPUs │ Use Case │
|
||||
├────────────────────────┼──────────────────────────┼──────────────────────┤
|
||||
│ AVX512 + AMX=OFF │ Skylake-X, Ice Lake, │ General distribution │
|
||||
│ │ Cascade Lake, Zen 4 │ (recommended) │
|
||||
├────────────────────────┼──────────────────────────┼──────────────────────┤
|
||||
│ AVX2 + AMX=OFF │ Haswell (2013) and newer │ Maximum compatibility│
|
||||
├────────────────────────┼──────────────────────────┼──────────────────────┤
|
||||
│ FANCY + AMX=OFF │ Ice Lake+, Zen 4+ │ Modern CPUs only │
|
||||
│ │ (with full AVX512 ext) │ │
|
||||
└────────────────────────┴──────────────────────────┴──────────────────────┘
|
||||
|
||||
Example manual build:
|
||||
Use this when: Building Docker images, PyPI packages, or deploying to clusters
|
||||
|
||||
Example: Build for general distribution
|
||||
export CPUINFER_CPU_INSTRUCT=AVX512
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
$0 build --manual
|
||||
# Result: Works on any CPU with AVX512 (2017+)
|
||||
|
||||
Advanced option (for binary distribution):
|
||||
FANCY - AVX512 with full extensions for Ice Lake+/Zen 4+
|
||||
Use this when building pre-compiled binaries to distribute.
|
||||
Example: Build for maximum compatibility
|
||||
export CPUINFER_CPU_INSTRUCT=AVX2
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
$0 build --manual
|
||||
# Result: Works on any CPU with AVX2 (2013+)
|
||||
|
||||
Optional variables (with defaults):
|
||||
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
|
||||
CPUINFER_PARALLEL=8 Number of parallel build jobs
|
||||
CPUINFER_VERBOSE=1 Verbose build output (0/1)
|
||||
CPUINFER_BUILD_TYPE=Release Build type (Debug/RelWithDebInfo/Release)
|
||||
CPUINFER_PARALLEL=8 Number of parallel build jobs
|
||||
CPUINFER_VERBOSE=1 Verbose build output (0/1)
|
||||
CPUINFER_ENABLE_AVX512_VNNI=ON/OFF Override VNNI detection (auto if unset)
|
||||
CPUINFER_ENABLE_AVX512_BF16=ON/OFF Override BF16 detection (auto if unset)
|
||||
|
||||
Software Fallback Support:
|
||||
✓ If VNNI not available: Uses AVX512BW fallback (2-3x slower but works)
|
||||
✓ If BF16 not available: Uses AVX512F fallback (5-10x slower but works)
|
||||
→ Old CPUs with only AVX512F+BW can run all code (slower but functional)
|
||||
|
||||
EOF
|
||||
exit 1
|
||||
@@ -120,20 +144,38 @@ install_dependencies() {
|
||||
}
|
||||
|
||||
# Function to detect CPU features
|
||||
# Returns: "has_amx has_avx512_vnni has_avx512_bf16" (space-separated 0/1 values)
|
||||
detect_cpu_features() {
|
||||
local has_amx=0
|
||||
local has_avx512_vnni=0
|
||||
local has_avx512_bf16=0
|
||||
|
||||
if [ -f /proc/cpuinfo ]; then
|
||||
local cpu_flags
|
||||
cpu_flags=$(grep -m1 "^flags" /proc/cpuinfo | tr ' ' '\n')
|
||||
|
||||
# Check for AMX support on Linux
|
||||
if grep -q "amx_tile\|amx_int8\|amx_bf16" /proc/cpuinfo; then
|
||||
if echo "$cpu_flags" | grep -qE "amx_tile|amx_int8|amx_bf16"; then
|
||||
has_amx=1
|
||||
fi
|
||||
|
||||
# Check for AVX512_VNNI support
|
||||
if echo "$cpu_flags" | grep -qE "avx512_vnni|avx512vnni"; then
|
||||
has_avx512_vnni=1
|
||||
fi
|
||||
|
||||
# Check for AVX512_BF16 support
|
||||
if echo "$cpu_flags" | grep -qE "avx512_bf16|avx512bf16"; then
|
||||
has_avx512_bf16=1
|
||||
fi
|
||||
elif [ "$(uname)" = "Darwin" ]; then
|
||||
# macOS doesn't have AMX (ARM or Intel without AMX)
|
||||
has_amx=0
|
||||
has_avx512_vnni=0
|
||||
has_avx512_bf16=0
|
||||
fi
|
||||
|
||||
echo "$has_amx"
|
||||
echo "$has_amx $has_avx512_vnni $has_avx512_bf16"
|
||||
}
|
||||
|
||||
build_step() {
|
||||
@@ -161,34 +203,6 @@ build_step() {
|
||||
echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
|
||||
fi
|
||||
|
||||
# Check for multi-variant build mode (Docker environment)
|
||||
if [ "${CPUINFER_BUILD_ALL_VARIANTS:-0}" = "1" ]; then
|
||||
echo "=========================================="
|
||||
echo "Building ALL CPU variants (AMX/AVX512/AVX2)"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "This will build three variants in a single wheel:"
|
||||
echo " - AMX variant (Intel Sapphire Rapids+)"
|
||||
echo " - AVX512 variant (Intel Skylake-X/Ice Lake+)"
|
||||
echo " - AVX2 variant (maximum compatibility)"
|
||||
echo ""
|
||||
echo "Runtime CPU detection will automatically select the best variant."
|
||||
echo ""
|
||||
|
||||
export CPUINFER_FORCE_REBUILD=1
|
||||
export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
|
||||
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
|
||||
|
||||
echo "Building with:"
|
||||
echo " CPUINFER_BUILD_ALL_VARIANTS=1"
|
||||
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
|
||||
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
|
||||
echo ""
|
||||
|
||||
pip install . -v
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ "$MANUAL_MODE" = "0" ]; then
|
||||
# Auto-detection mode
|
||||
echo "=========================================="
|
||||
@@ -196,25 +210,70 @@ build_step() {
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
HAS_AMX=$(detect_cpu_features)
|
||||
# detect_cpu_features returns "has_amx has_avx512_vnni has_avx512_bf16"
|
||||
CPU_FEATURES=$(detect_cpu_features)
|
||||
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
|
||||
HAS_AVX512_VNNI=$(echo "$CPU_FEATURES" | cut -d' ' -f2)
|
||||
HAS_AVX512_BF16=$(echo "$CPU_FEATURES" | cut -d' ' -f3)
|
||||
|
||||
export CPUINFER_CPU_INSTRUCT=NATIVE
|
||||
|
||||
if [ "$HAS_AMX" = "1" ]; then
|
||||
echo "✓ AMX instructions detected"
|
||||
export CPUINFER_CPU_INSTRUCT=NATIVE
|
||||
export CPUINFER_ENABLE_AMX=ON
|
||||
echo " Configuration: NATIVE + AMX=ON (best performance)"
|
||||
echo ""
|
||||
echo " ⚠️ Note: If you plan to use LLAMAFILE backend, use manual mode:"
|
||||
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2/FANCY"
|
||||
echo " export CPUINFER_ENABLE_AMX=OFF"
|
||||
echo " ./install.sh build --manual"
|
||||
echo "Configuration: NATIVE + AMX=ON"
|
||||
echo " ✓ Best performance on this machine"
|
||||
echo " ✗ Binary requires Sapphire Rapids or newer CPU"
|
||||
else
|
||||
echo "ℹ AMX instructions not detected"
|
||||
export CPUINFER_CPU_INSTRUCT=NATIVE
|
||||
export CPUINFER_ENABLE_AMX=OFF
|
||||
echo " Configuration: NATIVE + AMX=OFF"
|
||||
echo ""
|
||||
echo "Configuration: NATIVE + AMX=OFF"
|
||||
echo " ✓ Using AVX512/AVX2 instructions"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo " ⚠️ IMPORTANT: This binary is optimized for THIS CPU only"
|
||||
echo " To build portable binaries for distribution, use:"
|
||||
echo " export CPUINFER_CPU_INSTRUCT=AVX512 # or AVX2"
|
||||
echo " export CPUINFER_ENABLE_AMX=OFF"
|
||||
echo " ./install.sh build --manual"
|
||||
|
||||
# Fine-grained AVX512 subset detection (with fallback support)
|
||||
echo ""
|
||||
echo "AVX512 Feature Detection:"
|
||||
|
||||
# VNNI: Check if user manually set it, otherwise auto-detect
|
||||
if [ -n "${CPUINFER_ENABLE_AVX512_VNNI:-}" ]; then
|
||||
echo " VNNI: User override = $CPUINFER_ENABLE_AVX512_VNNI"
|
||||
else
|
||||
if [ "$HAS_AVX512_VNNI" = "1" ]; then
|
||||
echo " VNNI: ✓ Detected (hardware acceleration enabled)"
|
||||
export CPUINFER_ENABLE_AVX512_VNNI=ON
|
||||
else
|
||||
echo " VNNI: ✗ Not detected (will use software fallback, 2-3x slower)"
|
||||
export CPUINFER_ENABLE_AVX512_VNNI=OFF
|
||||
fi
|
||||
fi
|
||||
|
||||
# BF16: Check if user manually set it, otherwise auto-detect
|
||||
if [ -n "${CPUINFER_ENABLE_AVX512_BF16:-}" ]; then
|
||||
echo " BF16: User override = $CPUINFER_ENABLE_AVX512_BF16"
|
||||
else
|
||||
if [ "$HAS_AVX512_BF16" = "1" ]; then
|
||||
echo " BF16: ✓ Detected (hardware acceleration enabled)"
|
||||
export CPUINFER_ENABLE_AVX512_BF16=ON
|
||||
else
|
||||
echo " BF16: ✗ Not detected (will use software fallback, 5-10x slower)"
|
||||
export CPUINFER_ENABLE_AVX512_BF16=OFF
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo " Note: Software fallbacks ensure all code works on older CPUs"
|
||||
echo " Tip: Override with CPUINFER_ENABLE_AVX512_VNNI/BF16=ON/OFF"
|
||||
|
||||
echo ""
|
||||
echo "To use manual configuration instead, run: $0 build --manual"
|
||||
echo ""
|
||||
@@ -250,12 +309,32 @@ build_step() {
|
||||
|
||||
# Warn about problematic configuration
|
||||
if [ "$CPUINFER_CPU_INSTRUCT" = "NATIVE" ] && [ "$CPUINFER_ENABLE_AMX" = "OFF" ]; then
|
||||
HAS_AMX=$(detect_cpu_features)
|
||||
CPU_FEATURES=$(detect_cpu_features)
|
||||
HAS_AMX=$(echo "$CPU_FEATURES" | cut -d' ' -f1)
|
||||
if [ "$HAS_AMX" = "1" ]; then
|
||||
echo "⚠️ WARNING: NATIVE + AMX=OFF on AMX-capable CPU may cause compilation issues!"
|
||||
echo " Recommended: Use AVX512 or AVX2 instead of NATIVE when AMX=OFF"
|
||||
echo "=========================================="
|
||||
echo "⚠️ WARNING: Risky Configuration"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
read -p "Continue anyway? (y/N) " -n 1 -r
|
||||
echo "Your configuration:"
|
||||
echo " CPUINFER_CPU_INSTRUCT = NATIVE"
|
||||
echo " CPUINFER_ENABLE_AMX = OFF"
|
||||
echo ""
|
||||
echo "Your CPU HAS AMX support!"
|
||||
echo ""
|
||||
echo "Problem:"
|
||||
echo " • NATIVE uses -march=native which auto-enables ALL CPU features"
|
||||
echo " • This may IGNORE your AMX=OFF setting"
|
||||
echo " • The binary may still contain AMX instructions"
|
||||
echo ""
|
||||
echo "Recommended fixes:"
|
||||
echo " 1) For portable build (recommended for distribution):"
|
||||
echo " export CPUINFER_CPU_INSTRUCT=AVX512"
|
||||
echo ""
|
||||
echo " 2) If you want best performance on this CPU:"
|
||||
echo " export CPUINFER_ENABLE_AMX=ON"
|
||||
echo ""
|
||||
read -p "Continue with risky configuration? (y/N) " -n 1 -r
|
||||
echo
|
||||
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
|
||||
exit 1
|
||||
@@ -271,12 +350,15 @@ export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
|
||||
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
|
||||
export CPUINFER_VERBOSE=${CPUINFER_VERBOSE:-1}
|
||||
|
||||
echo "=========================================="
|
||||
echo "Building kt-kernel with configuration:"
|
||||
echo " CPUINFER_CPU_INSTRUCT=$CPUINFER_CPU_INSTRUCT"
|
||||
echo " CPUINFER_ENABLE_AMX=$CPUINFER_ENABLE_AMX"
|
||||
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
|
||||
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
|
||||
echo " CPUINFER_VERBOSE=$CPUINFER_VERBOSE"
|
||||
echo "=========================================="
|
||||
echo " CPUINFER_CPU_INSTRUCT = $CPUINFER_CPU_INSTRUCT"
|
||||
echo " CPUINFER_ENABLE_AMX = $CPUINFER_ENABLE_AMX"
|
||||
echo " CPUINFER_ENABLE_AVX512_VNNI = ${CPUINFER_ENABLE_AVX512_VNNI:-AUTO}"
|
||||
echo " CPUINFER_ENABLE_AVX512_BF16 = ${CPUINFER_ENABLE_AVX512_BF16:-AUTO}"
|
||||
echo " CPUINFER_BUILD_TYPE = $CPUINFER_BUILD_TYPE"
|
||||
echo " CPUINFER_PARALLEL = $CPUINFER_PARALLEL"
|
||||
echo ""
|
||||
|
||||
pip install . -v
|
||||
|
||||
@@ -94,10 +94,10 @@ class AMX_K2_MOE_TP {
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void dump_buffer_b(const std::string &quantization_type, int expert_idx, const std::string &matrix_type,
|
||||
typename T::BufferB *buffer) {
|
||||
auto &quant_config = config_.quant_config;
|
||||
int &group_size = quant_config.group_size;
|
||||
inline void dump_buffer_b(const std::string& quantization_type, int expert_idx, const std::string& matrix_type,
|
||||
typename T::BufferB* buffer) {
|
||||
auto& quant_config = config_.quant_config;
|
||||
int& group_size = quant_config.group_size;
|
||||
|
||||
printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
|
||||
matrix_type.c_str());
|
||||
@@ -110,7 +110,7 @@ class AMX_K2_MOE_TP {
|
||||
cols = config_.hidden_size;
|
||||
num_groups = cols / group_size;
|
||||
scale_elem_count = num_groups * rows;
|
||||
} else { // down
|
||||
} else { // down
|
||||
rows = config_.hidden_size;
|
||||
cols = config_.intermediate_size;
|
||||
num_groups = cols / group_size;
|
||||
@@ -133,8 +133,8 @@ class AMX_K2_MOE_TP {
|
||||
printf("\n");
|
||||
}
|
||||
// Dump quantized weights (as hex uint8)
|
||||
size_t weight_size = (rows * cols) / 2; // INT4 packed
|
||||
uint8_t *weight_ptr = (uint8_t *)buffer->b;
|
||||
size_t weight_size = (rows * cols) / 2; // INT4 packed
|
||||
uint8_t* weight_ptr = (uint8_t*)buffer->b;
|
||||
|
||||
printf(" Weights[first 32 bytes]: ");
|
||||
for (int i = 0; i < std::min(32, (int)weight_size); i++) {
|
||||
@@ -232,10 +232,12 @@ class AMX_K2_MOE_TP {
|
||||
// (config_.expert_num * T::BufferA::M_STEP) in pool_count_ is to ensure padding for each experts.
|
||||
pool_count_ = config_.max_len * config_.num_experts_per_tok + config_.expert_num * T::BufferA::M_STEP;
|
||||
|
||||
gate_up_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
|
||||
gate_up_ba_pool_bytes_ =
|
||||
(T::BufferA::required_size(pool_count_, config_.hidden_size, group_size)) + pool_count_ * 64;
|
||||
gate_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
|
||||
up_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.intermediate_size)) + pool_count_ * 64;
|
||||
down_ba_pool_bytes_ = (T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
|
||||
down_ba_pool_bytes_ =
|
||||
(T::BufferA::required_size(pool_count_, config_.intermediate_size, group_size)) + pool_count_ * 64;
|
||||
down_bc_pool_bytes_ = (T::BufferC::required_size(pool_count_, config_.hidden_size)) + pool_count_ * 64;
|
||||
|
||||
mem_requests.append_pointer(&gate_up_ba_pool_, gate_up_ba_pool_bytes_);
|
||||
@@ -276,8 +278,7 @@ class AMX_K2_MOE_TP {
|
||||
ith, nth);
|
||||
// up part
|
||||
up_bb_[expert_idx]->from_raw_mat(
|
||||
(uint8_t*)config_.up_proj +
|
||||
((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
|
||||
(uint8_t*)config_.up_proj + ((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
|
||||
ith, nth);
|
||||
},
|
||||
nullptr);
|
||||
@@ -302,19 +303,15 @@ class AMX_K2_MOE_TP {
|
||||
[this, physical_to_logical_map](int task_id) {
|
||||
uint64_t expert_idx = task_id;
|
||||
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
|
||||
size_t scale_elem_count =
|
||||
(config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
|
||||
size_t scale_elem_count = (config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
|
||||
|
||||
// convert scales from BF16 to FP32
|
||||
convert_or_copy(gate_bb_[expert_idx]->d,
|
||||
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count),
|
||||
scale_elem_count);
|
||||
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
|
||||
convert_or_copy(up_bb_[expert_idx]->d,
|
||||
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count),
|
||||
scale_elem_count);
|
||||
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
|
||||
convert_or_copy(down_bb_[expert_idx]->d,
|
||||
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count),
|
||||
scale_elem_count);
|
||||
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count), scale_elem_count);
|
||||
},
|
||||
nullptr);
|
||||
// dump_buffer_b("native", 0, "down", down_bb_[0].get());
|
||||
@@ -323,10 +320,10 @@ class AMX_K2_MOE_TP {
|
||||
// Reconstruct weights for all experts to the output buffers
|
||||
// This function handles the TP-specific portion of the reconstruction for all experts
|
||||
void write_weights_to_buffer(int gpu_tp_count, int cpu_tp_count, int num_experts, const GeneralMOEConfig& full_config,
|
||||
const std::vector<uintptr_t>& w13_weight_ptrs,
|
||||
const std::vector<uintptr_t>& w13_scale_ptrs,
|
||||
const std::vector<uintptr_t>& w2_weight_ptrs,
|
||||
const std::vector<uintptr_t>& w2_scale_ptrs) const {
|
||||
const std::vector<uintptr_t>& w13_weight_ptrs,
|
||||
const std::vector<uintptr_t>& w13_scale_ptrs,
|
||||
const std::vector<uintptr_t>& w2_weight_ptrs,
|
||||
const std::vector<uintptr_t>& w2_scale_ptrs) const {
|
||||
const int group_size = config_.quant_config.group_size;
|
||||
auto pool = config_.pool->get_subpool(tp_part_idx);
|
||||
|
||||
@@ -379,18 +376,19 @@ class AMX_K2_MOE_TP {
|
||||
// Gate (first part of w13 for this expert)
|
||||
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b;
|
||||
float* gate_scale_src = gate_bb_[expert_id]->d;
|
||||
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight,
|
||||
gate_weight_src, cpu_tp_weight_bytes);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale),
|
||||
gate_scale_src, cpu_tp_scale_elem_count);
|
||||
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight, gate_weight_src,
|
||||
cpu_tp_weight_bytes);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale), gate_scale_src,
|
||||
cpu_tp_scale_elem_count);
|
||||
|
||||
// Up (second part of w13 for this expert, immediately after gate)
|
||||
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b;
|
||||
float* up_scale_src = up_bb_[expert_id]->d;
|
||||
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight + gpu_tp_weight_bytes,
|
||||
up_weight_src, cpu_tp_weight_bytes);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
|
||||
up_scale_src, cpu_tp_scale_elem_count);
|
||||
up_weight_src, cpu_tp_weight_bytes);
|
||||
convert_or_copy(
|
||||
(ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
|
||||
up_scale_src, cpu_tp_scale_elem_count);
|
||||
|
||||
// Down (w2) - need to handle column-wise slicing
|
||||
// The down matrix is transposed compared to gate/up, so we need to extract by columns
|
||||
@@ -406,17 +404,16 @@ class AMX_K2_MOE_TP {
|
||||
size_t gpu_col_slice_offset = local_idx * (config_.intermediate_size >> 1);
|
||||
|
||||
std::memcpy(w2_weight_dst + w2_expert_base_weight + gpu_col_offset + gpu_col_slice_offset,
|
||||
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset,
|
||||
config_.intermediate_size / 2);
|
||||
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset, config_.intermediate_size / 2);
|
||||
|
||||
// Same for scales
|
||||
size_t gpu_scale_col_offset = col * ((full_config.intermediate_size / gpu_tp_count) / group_size);
|
||||
size_t cpu_scale_col_offset = col * (config_.intermediate_size / group_size);
|
||||
size_t gpu_scale_slice_offset = local_idx * (config_.intermediate_size / group_size);
|
||||
|
||||
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
|
||||
down_bb_[expert_id]->d + cpu_scale_col_offset,
|
||||
config_.intermediate_size / group_size);
|
||||
convert_or_copy(
|
||||
(ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
|
||||
down_bb_[expert_id]->d + cpu_scale_col_offset, config_.intermediate_size / group_size);
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
@@ -460,16 +457,15 @@ class AMX_K2_MOE_TP {
|
||||
// Gate (first part of w13 for this expert)
|
||||
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b + cpu_offset_weight;
|
||||
float* gate_scale_src = gate_bb_[expert_id]->d + cpu_offset_scale;
|
||||
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight,
|
||||
gate_weight_src, data_per_gpu_tp_weight);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale),
|
||||
gate_scale_src, data_per_gpu_tp_scale);
|
||||
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight, gate_weight_src, data_per_gpu_tp_weight);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale), gate_scale_src,
|
||||
data_per_gpu_tp_scale);
|
||||
|
||||
// Up (second part of w13 for this expert, immediately after gate)
|
||||
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b + cpu_offset_weight;
|
||||
float* up_scale_src = up_bb_[expert_id]->d + cpu_offset_scale;
|
||||
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes,
|
||||
up_weight_src, data_per_gpu_tp_weight);
|
||||
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes, up_weight_src,
|
||||
data_per_gpu_tp_weight);
|
||||
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale + gpu_tp_scale_elem_count),
|
||||
up_scale_src, data_per_gpu_tp_scale);
|
||||
|
||||
@@ -477,16 +473,20 @@ class AMX_K2_MOE_TP {
|
||||
// The down matrix is transposed compared to gate/up, so we need to extract by columns
|
||||
for (size_t col = 0; col < config_.hidden_size; col++) {
|
||||
// Calculate the offset within the column for this GPU TP part
|
||||
size_t col_offset_weight = (col * config_.intermediate_size / 2) + (local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
|
||||
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) + (local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
|
||||
size_t col_offset_weight = (col * config_.intermediate_size / 2) +
|
||||
(local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
|
||||
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) +
|
||||
(local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
|
||||
|
||||
// Copy weights column by column
|
||||
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight + (col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
|
||||
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight +
|
||||
(col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
|
||||
(uint8_t*)down_bb_[expert_id]->b + col_offset_weight,
|
||||
(config_.intermediate_size / gpu_tps_per_cpu_tp) / 2);
|
||||
|
||||
// Copy scales column by column
|
||||
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale + col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
|
||||
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale +
|
||||
col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
|
||||
down_bb_[expert_id]->d + col_offset_scale,
|
||||
(config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size);
|
||||
}
|
||||
@@ -587,8 +587,7 @@ class AMX_K2_MOE_TP {
|
||||
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
|
||||
offset += m_local_num_[i];
|
||||
|
||||
if (m_local_num_[i] == 0)
|
||||
continue;
|
||||
if (m_local_num_[i] == 0) continue;
|
||||
size_t max_m = (m_local_num_[i] + M_STEP - 1) / M_STEP * M_STEP;
|
||||
gate_up_ba_[i]->max_m = max_m;
|
||||
gate_up_ba_[i]->set_data(gate_up_ba_pool_ptr);
|
||||
@@ -801,7 +800,8 @@ class AMX_K2_MOE_TP {
|
||||
down_time, weight_time, forward_total_time, max_local_num, qlen);
|
||||
#endif
|
||||
// for (int i = 0; i < qlen; i ++)
|
||||
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size, (float*)output + i * config_.hidden_size);
|
||||
// forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size,
|
||||
// (float*)output + i * config_.hidden_size);
|
||||
}
|
||||
|
||||
void forward_decode(int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
|
||||
@@ -826,7 +826,7 @@ class AMX_K2_MOE_TP {
|
||||
m_expert_id_map_[activated_expert] = expert_ids[i];
|
||||
activated_expert++;
|
||||
}
|
||||
|
||||
|
||||
size_t offset = 0;
|
||||
for (int i = 0; i < activated_expert; i++) {
|
||||
auto expert_idx = m_expert_id_map_[i];
|
||||
@@ -912,9 +912,9 @@ class AMX_K2_MOE_TP {
|
||||
amx::vec_mul_kgroup(qlen, config_.intermediate_size, config_.hidden_size, group_size, gate_up_ba_[0],
|
||||
gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth);
|
||||
gate_bc_[expert_idx]->to_mat(qlen, m_local_gate_output_ptr_[expert_idx], ith, nth);
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
|
||||
#ifdef DEBUG_K2_MOE
|
||||
if (activated_expert > 0) {
|
||||
@@ -971,7 +971,6 @@ class AMX_K2_MOE_TP {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef FORWARD_TIME_PROFILE
|
||||
{
|
||||
auto now_time = std::chrono::high_resolution_clock::now();
|
||||
@@ -1056,9 +1055,9 @@ class AMX_K2_MOE_TP {
|
||||
}
|
||||
__m512 weight = _mm512_set1_ps(weights[j]);
|
||||
__m512 down_output0, down_output1;
|
||||
avx512_32xbf16_to_32xfp32((__m512i*)(m_local_down_output_ptr_[expert_ids[j]] +
|
||||
m_local_pos_[0][j] * config_.hidden_size + e),
|
||||
&down_output0, &down_output1);
|
||||
avx512_32xbf16_to_32xfp32(
|
||||
(__m512i*)(m_local_down_output_ptr_[expert_ids[j]] + m_local_pos_[0][j] * config_.hidden_size + e),
|
||||
&down_output0, &down_output1);
|
||||
x0 = _mm512_fmadd_ps(down_output0, weight, x0);
|
||||
x1 = _mm512_fmadd_ps(down_output1, weight, x1);
|
||||
}
|
||||
@@ -1151,28 +1150,26 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
|
||||
|
||||
// TP-slicing for gate and up (row-major slicing)
|
||||
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
|
||||
src_gate + ((i * weight_elem_count) >> 1),
|
||||
(weight_elem_count >> 1));
|
||||
src_gate + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
|
||||
|
||||
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
|
||||
src_up + ((i * weight_elem_count) >> 1),
|
||||
(weight_elem_count >> 1));
|
||||
src_up + ((i * weight_elem_count) >> 1), (weight_elem_count >> 1));
|
||||
|
||||
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
|
||||
src_gate_scale + (i * scales_elem_count),
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
src_gate_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
|
||||
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
|
||||
src_up_scale + (i * scales_elem_count),
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
src_up_scale + (i * scales_elem_count), sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
|
||||
// TP-slicing for down (by column)
|
||||
for (size_t col = 0; col < config.hidden_size; col++) {
|
||||
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
|
||||
src_down + ((col * config.intermediate_size + i * tpc.intermediate_size) >> 1),
|
||||
(tpc.intermediate_size >> 1));
|
||||
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
|
||||
src_down_scale + (col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
|
||||
memcpy((ggml_bf16_t*)tpc.down_scale +
|
||||
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
|
||||
src_down_scale +
|
||||
(col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
|
||||
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
|
||||
}
|
||||
},
|
||||
@@ -1197,43 +1194,45 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
|
||||
if (tps[i]->config_.load == false) {
|
||||
pool->get_subpool(i)->do_work_stealing_job(
|
||||
tpc.expert_num, nullptr,
|
||||
[&](int expert_id_) { // weight and scale are all in col majored.
|
||||
[&](int expert_id_) { // weight and scale are all in col majored.
|
||||
size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
|
||||
|
||||
// weight and scale TP-slicing for gate and up
|
||||
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
|
||||
(uint8_t*)config.gate_proj +
|
||||
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
|
||||
((sizeof(uint8_t) * weight_elem_count) >> 1));
|
||||
(uint8_t*)config.gate_proj +
|
||||
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
|
||||
((sizeof(uint8_t) * weight_elem_count) >> 1));
|
||||
|
||||
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
|
||||
(uint8_t*)config.up_proj +
|
||||
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
|
||||
((sizeof(uint8_t) * weight_elem_count) >> 1));
|
||||
(uint8_t*)config.up_proj +
|
||||
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
|
||||
((sizeof(uint8_t) * weight_elem_count) >> 1));
|
||||
|
||||
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
|
||||
(ggml_bf16_t*)config.gate_scale +
|
||||
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
|
||||
(ggml_bf16_t*)config.gate_scale +
|
||||
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
|
||||
i * scales_elem_count),
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
|
||||
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
|
||||
(ggml_bf16_t*)config.up_scale +
|
||||
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
|
||||
(ggml_bf16_t*)config.up_scale +
|
||||
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
|
||||
i * scales_elem_count),
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
sizeof(ggml_bf16_t) * scales_elem_count);
|
||||
|
||||
// weight and scale TP-slicing for down (by column)
|
||||
for (size_t col = 0; col < config.hidden_size; col++) {
|
||||
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
|
||||
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
|
||||
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
|
||||
col * config.intermediate_size + i * tpc.intermediate_size) >>
|
||||
1),
|
||||
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
|
||||
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
|
||||
(ggml_bf16_t*)config.down_scale + ((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
|
||||
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
|
||||
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
|
||||
1),
|
||||
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
|
||||
memcpy((ggml_bf16_t*)tpc.down_scale +
|
||||
(expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
|
||||
(ggml_bf16_t*)config.down_scale +
|
||||
((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
|
||||
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
|
||||
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
@@ -1245,7 +1244,8 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
|
||||
#ifdef LOAD_TIME_PROFILE
|
||||
{
|
||||
auto load_now_time = std::chrono::high_resolution_clock::now();
|
||||
alloc_and_tp_slice_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
|
||||
alloc_and_tp_slice_time =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
|
||||
load_last = load_now_time;
|
||||
}
|
||||
#endif
|
||||
@@ -1277,9 +1277,11 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
|
||||
cleanup_time = std::chrono::duration_cast<std::chrono::microseconds>(load_now_time - load_last).count();
|
||||
}
|
||||
auto load_end_time = std::chrono::high_resolution_clock::now();
|
||||
auto load_total_time = std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
|
||||
auto load_total_time =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(load_end_time - load_start_time).count();
|
||||
printf(
|
||||
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, total: %ld us\n",
|
||||
"[K2 MoE Load Weights] tp_count: %d, alloc_and_tp_slice: %ld us, tps_load_weights: %ld us, cleanup: %ld us, "
|
||||
"total: %ld us\n",
|
||||
tp_count, alloc_and_tp_slice_time, tps_load_time, cleanup_time, load_total_time);
|
||||
#endif
|
||||
|
||||
@@ -1307,15 +1309,13 @@ class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
|
||||
auto& config = this->config;
|
||||
auto pool = config.pool;
|
||||
// Each TP part writes to its corresponding buffer
|
||||
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num,
|
||||
w13_weight_ptrs, w13_scale_ptrs, w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
|
||||
pool->dispense_backend()->do_numa_job([this, pool, gpu_tp_count, gpu_experts_num, w13_weight_ptrs, w13_scale_ptrs,
|
||||
w2_weight_ptrs, w2_scale_ptrs](int numa_id) {
|
||||
// Note: w13 combines gate and up projections
|
||||
// Split w13 pointers for gate and up
|
||||
this->tps[numa_id]->write_weights_to_buffer(
|
||||
gpu_tp_count, this->tp_count,
|
||||
gpu_experts_num, this->config,
|
||||
w13_weight_ptrs, w13_scale_ptrs, //gate + up use w13
|
||||
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
|
||||
this->tps[numa_id]->write_weights_to_buffer(gpu_tp_count, this->tp_count, gpu_experts_num, this->config,
|
||||
w13_weight_ptrs, w13_scale_ptrs, // gate + up use w13
|
||||
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -350,10 +350,7 @@ struct BufferAKGroupImpl {
|
||||
return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size);
|
||||
}
|
||||
|
||||
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
|
||||
: max_m(max_m),
|
||||
k(k),
|
||||
k_group_size(k_group_size) {
|
||||
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : max_m(max_m), k(k), k_group_size(k_group_size) {
|
||||
ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
|
||||
ASSERT_RELEASE(max_m % M_STEP == 0, "max_m must be multiple of M_STEP");
|
||||
ASSERT_RELEASE(k % K_STEP == 0, "k must be multiple of K_STEP");
|
||||
@@ -459,8 +456,7 @@ struct BufferASmallKGroupImpl : public BufferAKGroupImpl<K> {
|
||||
static constexpr int K_STEP = K::K_STEP;
|
||||
static constexpr int K_BLOCK = K::K_BLOCK;
|
||||
|
||||
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
|
||||
: Base(max_m, k, k_group_size, ptr) {}
|
||||
BufferASmallKGroupImpl(int max_m, int k, int k_group_size, void* ptr) : Base(max_m, k, k_group_size, ptr) {}
|
||||
|
||||
// Override from_mat to write only 32 bytes per K_STEP iteration
|
||||
void from_mat(int m, ggml_bf16_t* src, int ith, int nth) {
|
||||
@@ -991,8 +987,8 @@ struct BufferBInt4WithZeroImpl {
|
||||
template <typename K>
|
||||
struct BufferBInt4KGroupImpl {
|
||||
using dt = typename K::dt;
|
||||
dt* b; // packed signed int4 weights, col majored
|
||||
float* d; // scales only (no mins/zero-points), row majored
|
||||
dt* b; // packed signed int4 weights, col majored
|
||||
float* d; // scales only (no mins/zero-points), row majored
|
||||
int n, k, k_group_size, k_group_count;
|
||||
|
||||
static constexpr int N_STEP = K::N_STEP;
|
||||
@@ -1009,8 +1005,8 @@ struct BufferBInt4KGroupImpl {
|
||||
assert(n % N_STEP == 0);
|
||||
assert(k % K_STEP == 0);
|
||||
if (n % N_STEP || k % K_STEP || k % k_group_size) {
|
||||
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP,
|
||||
K_STEP, k_group_size);
|
||||
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP, K_STEP,
|
||||
k_group_size);
|
||||
throw std::runtime_error("n or k is not aligned to N_STEP or K_STEP");
|
||||
}
|
||||
k_group_count = k / k_group_size;
|
||||
@@ -1043,8 +1039,8 @@ struct BufferBInt4KGroupImpl {
|
||||
|
||||
// Get scale pointer for a specific row and k_group
|
||||
float* get_scale(int n, int n_begin, int k, int k_begin) {
|
||||
int k_group_idx = k_begin / k_group_size;
|
||||
return d + n_begin * (k / k_group_size) + k_group_idx;
|
||||
int k_group_idx = k_begin / k_group_size;
|
||||
return d + n_begin * (k / k_group_size) + k_group_idx;
|
||||
}
|
||||
|
||||
// Split range for parallel processing
|
||||
|
||||
@@ -1552,9 +1552,9 @@ struct GemmKernel224Int4_1 {
|
||||
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
|
||||
for (int n_i = 0; n_i < 2; n_i++) {
|
||||
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2491,7 +2491,7 @@ struct GemmKernel224Int4_1KGroup {
|
||||
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
|
||||
for (int n_i = 0; n_i < 2; n_i++) {
|
||||
__m512i b512_lo = _mm512_slli_epi32(_mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]), 4);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2503,7 +2503,7 @@ struct GemmKernel224Int4_1KGroup {
|
||||
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
|
||||
for (int n_i = 0; n_i < 2; n_i++) {
|
||||
__m512i b512_hi = _mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2767,7 +2767,7 @@ struct GemmKernel224Int4_1_LowKGroup {
|
||||
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
|
||||
for (int n_i = 0; n_i < 2; n_i++) {
|
||||
__m512i b512_lo = _mm512_and_si512(K::lo_mask(), b512[n_i * 16 + k_i]);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_lo, ma_lo);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2779,7 +2779,7 @@ struct GemmKernel224Int4_1_LowKGroup {
|
||||
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
|
||||
for (int n_i = 0; n_i < 2; n_i++) {
|
||||
__m512i b512_hi = _mm512_srli_epi32(_mm512_and_si512(K::hi_mask(), b512[n_i * 16 + k_i]), 4);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
c512[m_i * 2 + n_i] = _mm512_dpbusd_epi32_compat(c512[m_i * 2 + n_i], b512_hi, ma_hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2850,7 +2850,7 @@ struct GemmKernel224Int4SmallKGroup {
|
||||
using output_t = int32_t;
|
||||
static constexpr double ELEMENT_SIZE = 0.5;
|
||||
static const int VNNI_BLK = 4;
|
||||
|
||||
|
||||
static const int M_STEP = 1;
|
||||
static const int N_STEP = 32;
|
||||
static const int K_STEP = 32;
|
||||
@@ -2870,18 +2870,15 @@ struct GemmKernel224Int4SmallKGroup {
|
||||
|
||||
alignas(64) static constexpr uint8_t hi_mask_arr[32] = {
|
||||
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
|
||||
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0
|
||||
};
|
||||
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0};
|
||||
|
||||
alignas(64) static constexpr uint8_t lo_mask_arr[32] = {
|
||||
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
|
||||
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
|
||||
};
|
||||
|
||||
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};
|
||||
|
||||
alignas(64) static constexpr uint8_t sign_xor_arr[32] = {
|
||||
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
|
||||
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88
|
||||
};
|
||||
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88};
|
||||
static __m256i hi_mask() { return *((__m256i*)(&hi_mask_arr[0])); }
|
||||
static __m256i lo_mask() { return *((__m256i*)(&lo_mask_arr[0])); }
|
||||
static __m256i sign_xor_mask() { return *((__m256i*)(&sign_xor_arr[0])); }
|
||||
@@ -2902,27 +2899,28 @@ struct GemmKernel224Int4SmallKGroup {
|
||||
const __m512i lane_shuffle = _mm512_set_epi64(7, 6, 3, 2, 5, 4, 1, 0);
|
||||
return _mm512_permutexvar_epi64(lane_shuffle, result);
|
||||
}
|
||||
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB *bb, BufferC* bc, int ith, int nth) {
|
||||
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB* bb,
|
||||
BufferC* bc, int ith, int nth) {
|
||||
auto [n_start, n_end] = split_range_n(n, ith, nth);
|
||||
for (int m_begin = 0; m_begin < m; m_begin ++) {
|
||||
for (int m_begin = 0; m_begin < m; m_begin++) {
|
||||
float* c = bc->get_submat(m, n, m_begin, n_start);
|
||||
__m512i* a512 = (__m512i*)ba->get_submat(m, k, m_begin, 0);
|
||||
|
||||
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin ++) {
|
||||
|
||||
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin++) {
|
||||
__m256i* b256 = (__m256i*)bb->get_submat(n, k, n_block_begin, 0);
|
||||
float* as = (float*)ba->get_scale(m, m_begin, k, 0);
|
||||
float* bs = (float*)bb->get_scale(n, n_block_begin, k, 0);
|
||||
|
||||
|
||||
__m512 sum = _mm512_setzero_ps();
|
||||
#define WORK_K_BLOCK(k_block) \
|
||||
{ \
|
||||
__m256 abscale0 = _mm256_set1_ps(as[(k_block)*2] * bs[(k_block)*2]); \
|
||||
__m256 abscale1 = _mm256_set1_ps(as[(k_block)*2+1] * bs[(k_block)*2+1]); \
|
||||
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
|
||||
__m512i mul = _mm512_setzero_si512(); \
|
||||
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
|
||||
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
|
||||
}
|
||||
#define WORK_K_BLOCK(k_block) \
|
||||
{ \
|
||||
__m256 abscale0 = _mm256_set1_ps(as[(k_block) * 2] * bs[(k_block) * 2]); \
|
||||
__m256 abscale1 = _mm256_set1_ps(as[(k_block) * 2 + 1] * bs[(k_block) * 2 + 1]); \
|
||||
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
|
||||
__m512i mul = _mm512_setzero_si512(); \
|
||||
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
|
||||
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
|
||||
}
|
||||
|
||||
for (int k_block = 0; k_block < k / 64; k_block += 2) {
|
||||
WORK_K_BLOCK(k_block);
|
||||
@@ -2935,13 +2933,15 @@ struct GemmKernel224Int4SmallKGroup {
|
||||
}
|
||||
};
|
||||
|
||||
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
|
||||
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
|
||||
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
|
||||
}
|
||||
|
||||
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
|
||||
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
|
||||
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
|
||||
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
|
||||
|
||||
@@ -211,26 +211,44 @@ inline __m256i merge_q8K_bsum(block_q8_K* b) {
|
||||
return _mm256_madd_epi16(_mm256_loadu_si256((__m256i*)b->bsums), _mm256_set1_epi16(1));
|
||||
}
|
||||
|
||||
inline __m512i _mm512_dpbusd_epi32_compat(__m512i src, __m512i a, __m512i b) {
|
||||
#if defined(__AVX512VNNI__)
|
||||
return _mm512_dpbusd_epi32(src, a, b);
|
||||
#else
|
||||
const __m512i mask_lo = _mm512_set1_epi16(0x00FF);
|
||||
const __m512i ones16 = _mm512_set1_epi16(1);
|
||||
|
||||
__m512i a_even = _mm512_and_si512(a, mask_lo);
|
||||
__m512i b_even = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);
|
||||
|
||||
__m512i a_odd = _mm512_srli_epi16(a, 8);
|
||||
__m512i b_odd = _mm512_srai_epi16(b, 8);
|
||||
|
||||
__m512i prod_even = _mm512_mullo_epi16(a_even, b_even);
|
||||
__m512i prod_odd = _mm512_mullo_epi16(a_odd, b_odd);
|
||||
|
||||
__m512i sum_even = _mm512_madd_epi16(prod_even, ones16);
|
||||
__m512i sum_odd = _mm512_madd_epi16(prod_odd, ones16);
|
||||
|
||||
return _mm512_add_epi32(src, _mm512_add_epi32(sum_even, sum_odd));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __m512i _mm512_dpbssd_epi32(__m512i src, __m512i a, __m512i b) {
|
||||
// 提取高低256-bit部分
|
||||
__m256i a_lo = _mm512_extracti64x4_epi64(a, 0);
|
||||
__m256i a_hi = _mm512_extracti64x4_epi64(a, 1);
|
||||
__m256i b_lo = _mm512_extracti64x4_epi64(b, 0);
|
||||
__m256i b_hi = _mm512_extracti64x4_epi64(b, 1);
|
||||
|
||||
// 根据a的符号调整b的符号
|
||||
b_lo = _mm256_sign_epi8(b_lo, a_lo);
|
||||
b_hi = _mm256_sign_epi8(b_hi, a_hi);
|
||||
|
||||
// 将修改后的b重新组合
|
||||
b = _mm512_inserti64x4(b, b_lo, 0);
|
||||
b = _mm512_inserti64x4(b, b_hi, 1);
|
||||
|
||||
// 取绝对值
|
||||
a = _mm512_abs_epi8(a);
|
||||
|
||||
// 进行dot-product计算
|
||||
return _mm512_dpbusd_epi32(src, a, b);
|
||||
return _mm512_dpbusd_epi32_compat(src, a, b);
|
||||
}
|
||||
|
||||
} // namespace amx
|
||||
|
||||
@@ -9,10 +9,42 @@ static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
|
||||
_mm512_storeu_si512(dst, _mm512_loadu_si512(src));
|
||||
}
|
||||
|
||||
// FP32 to BF16 conversion (32 floats -> 32 bf16)
|
||||
// This requires AVX512BF16 for the fast path, with a fallback for CPUs without it
|
||||
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
|
||||
#if defined(HAVE_AVX512BF16) || defined(__AVX512BF16__)
|
||||
// Fast path: use native AVX512BF16 instruction
|
||||
_mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
|
||||
#else
|
||||
// Fallback: manual BF16 conversion using bit manipulation
|
||||
// BF16 is the upper 16 bits of FP32 (with rounding)
|
||||
__m512i i0 = _mm512_castps_si512(*src0);
|
||||
__m512i i1 = _mm512_castps_si512(*src1);
|
||||
|
||||
// Round to nearest even: add 0x7FFF + ((val >> 16) & 1)
|
||||
__m512i round0 =
|
||||
_mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i0, 16), _mm512_set1_epi32(1)));
|
||||
__m512i round1 =
|
||||
_mm512_add_epi32(_mm512_set1_epi32(0x7FFF), _mm512_and_epi32(_mm512_srli_epi32(i1, 16), _mm512_set1_epi32(1)));
|
||||
|
||||
i0 = _mm512_add_epi32(i0, round0);
|
||||
i1 = _mm512_add_epi32(i1, round1);
|
||||
|
||||
// Extract upper 16 bits (BF16)
|
||||
i0 = _mm512_srli_epi32(i0, 16);
|
||||
i1 = _mm512_srli_epi32(i1, 16);
|
||||
|
||||
// Pack 32-bit values to 16-bit
|
||||
__m512i result = _mm512_packus_epi32(i0, i1);
|
||||
// Fix the interleaving from packus
|
||||
result = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), result);
|
||||
|
||||
_mm512_storeu_si512(dst, result);
|
||||
#endif
|
||||
}
|
||||
|
||||
// BF16 to FP32 conversion (32 bf16 -> 32 floats)
|
||||
// This does NOT require AVX512BF16 - uses basic AVX512 bit manipulation
|
||||
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
|
||||
_mm512_storeu_ps(dst0, _mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));
|
||||
|
||||
@@ -194,7 +194,7 @@ class CMakeBuild(build_ext):
|
||||
info["raw"]["flags"] = flags
|
||||
|
||||
# feature summary
|
||||
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl", "avx512vnni"]):
|
||||
if any(f in flags or f in low for f in ["avx512f", "avx512bw", "avx512dq", "avx512vl"]):
|
||||
info["features"].add("AVX512")
|
||||
if "avx2" in flags or "avx2" in low:
|
||||
info["features"].add("AVX2")
|
||||
@@ -205,6 +205,16 @@ class CMakeBuild(build_ext):
|
||||
):
|
||||
info["features"].add("AMX")
|
||||
|
||||
# Fine-grained AVX512 subset detection
|
||||
if any(f in flags for f in ["avx512_vnni", "avx512vnni"]):
|
||||
info["features"].add("AVX512_VNNI")
|
||||
if any(f in flags for f in ["avx512_bf16", "avx512bf16"]):
|
||||
info["features"].add("AVX512_BF16")
|
||||
if any(f in flags for f in ["avx512_vbmi", "avx512vbmi"]):
|
||||
info["features"].add("AVX512_VBMI")
|
||||
if any(f in flags for f in ["avx512_vpopcntdq", "avx512vpopcntdq"]):
|
||||
info["features"].add("AVX512_VPOPCNTDQ")
|
||||
|
||||
elif sysname == "Darwin":
|
||||
# macOS: Apple Silicon (arm64) vs Intel
|
||||
arch = platform.machine().lower()
|
||||
@@ -229,133 +239,6 @@ class CMakeBuild(build_ext):
|
||||
return info
|
||||
|
||||
def build_extension(self, ext: CMakeExtension):
|
||||
"""
|
||||
Main entry point for building the extension.
|
||||
|
||||
Checks if multi-variant build is requested (CPUINFER_BUILD_ALL_VARIANTS=1)
|
||||
and routes to the appropriate build method.
|
||||
"""
|
||||
if _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False):
|
||||
# Build all 3 variants (AMX, AVX512, AVX2)
|
||||
self.build_multi_variants(ext)
|
||||
else:
|
||||
# Build single variant (original behavior)
|
||||
self._build_single_variant(ext)
|
||||
|
||||
def build_multi_variants(self, ext: CMakeExtension):
|
||||
"""
|
||||
Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel.
|
||||
|
||||
This method is called when CPUINFER_BUILD_ALL_VARIANTS=1 is set.
|
||||
It builds three separate extensions with different CPU instruction sets
|
||||
and renames the output .so files with variant suffixes.
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)")
|
||||
print("=" * 80)
|
||||
|
||||
# Define the 3 variants to build
|
||||
variants = [
|
||||
{
|
||||
'name': 'amx',
|
||||
'env': {
|
||||
'CPUINFER_CPU_INSTRUCT': 'NATIVE',
|
||||
'CPUINFER_ENABLE_AMX': 'ON',
|
||||
},
|
||||
'description': 'AMX variant (Intel Sapphire Rapids+)'
|
||||
},
|
||||
{
|
||||
'name': 'avx512',
|
||||
'env': {
|
||||
'CPUINFER_CPU_INSTRUCT': 'AVX512',
|
||||
'CPUINFER_ENABLE_AMX': 'OFF',
|
||||
},
|
||||
'description': 'AVX512 variant (Intel Skylake-X/Ice Lake/Cascade Lake)'
|
||||
},
|
||||
{
|
||||
'name': 'avx2',
|
||||
'env': {
|
||||
'CPUINFER_CPU_INSTRUCT': 'AVX2',
|
||||
'CPUINFER_ENABLE_AMX': 'OFF',
|
||||
},
|
||||
'description': 'AVX2 variant (maximum compatibility)'
|
||||
}
|
||||
]
|
||||
|
||||
# Save original environment
|
||||
original_env = os.environ.copy()
|
||||
|
||||
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
|
||||
|
||||
for i, variant in enumerate(variants, 1):
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Building variant {i}/3: {variant['description']}")
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
# Set variant-specific environment variables
|
||||
os.environ.update(variant['env'])
|
||||
|
||||
# Use a unique build directory for this variant
|
||||
original_build_temp = self.build_temp
|
||||
self.build_temp = str(Path(self.build_temp) / f"variant_{variant['name']}")
|
||||
|
||||
try:
|
||||
# Build this variant (calls the single-variant build logic)
|
||||
self._build_single_variant(ext)
|
||||
|
||||
# Rename the generated .so file to include variant suffix
|
||||
# Original: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
|
||||
# Renamed: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
|
||||
|
||||
# Extract the base extension name (without package prefix)
|
||||
# ext.name is "kt_kernel.kt_kernel_ext", we want "kt_kernel_ext"
|
||||
base_ext_name = ext.name.split('.')[-1]
|
||||
|
||||
# Find the newly built .so file
|
||||
import time
|
||||
time.sleep(0.5) # Give filesystem time to sync
|
||||
|
||||
built_candidates = [
|
||||
f for f in Path(extdir).glob("*.so")
|
||||
if f.name.startswith(base_ext_name) and not f.name.startswith(f"_{base_ext_name}_")
|
||||
]
|
||||
|
||||
if not built_candidates:
|
||||
print(f"WARNING: No .so file found for {base_ext_name} in {extdir}")
|
||||
print(f"Files in {extdir}:")
|
||||
for f in Path(extdir).glob("*.so"):
|
||||
print(f" {f.name}")
|
||||
|
||||
for so_file in built_candidates:
|
||||
# Extract the python tag part (e.g., ".cpython-311-x86_64-linux-gnu.so")
|
||||
suffix = so_file.name.replace(base_ext_name, "")
|
||||
new_name = f"_{base_ext_name}_{variant['name']}{suffix}"
|
||||
new_path = extdir / new_name
|
||||
|
||||
print(f"-- Renaming {so_file.name} -> {new_name}")
|
||||
if new_path.exists():
|
||||
print(f" WARNING: Target file already exists, removing: {new_path}")
|
||||
new_path.unlink()
|
||||
so_file.rename(new_path)
|
||||
print(f" ✓ Successfully renamed to {new_name}")
|
||||
|
||||
finally:
|
||||
# Restore build_temp for next iteration
|
||||
self.build_temp = original_build_temp
|
||||
|
||||
# Restore original environment
|
||||
os.environ.clear()
|
||||
os.environ.update(original_env)
|
||||
|
||||
print(f"\n{'=' * 80}")
|
||||
print("✓ Successfully built all 3 CPU variants")
|
||||
print(f"{'=' * 80}\n")
|
||||
|
||||
def _build_single_variant(self, ext: CMakeExtension):
|
||||
"""
|
||||
Build a single CPU variant. This contains the core build logic
|
||||
extracted from the original build_extension method.
|
||||
"""
|
||||
# Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
|
||||
def detect_cuda_toolkit() -> bool:
|
||||
# Respect CUDA_HOME
|
||||
@@ -403,10 +286,6 @@ class CMakeBuild(build_ext):
|
||||
auto_cuda = detect_cuda_toolkit()
|
||||
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
|
||||
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
|
||||
elif cuda_env:
|
||||
print("-- CPUINFER_USE_CUDA explicitly enabled")
|
||||
else:
|
||||
print("-- CPUINFER_USE_CUDA explicitly disabled")
|
||||
|
||||
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
|
||||
cfg = default_build_type()
|
||||
@@ -461,6 +340,17 @@ class CMakeBuild(build_ext):
|
||||
else:
|
||||
print(f"-- CPUINFER_CPU_INSTRUCT={cpu_mode}; not auto-enabling AMX/AVX512 umbrella")
|
||||
|
||||
# Fine-grained AVX512 subset flags: only enable if CPU actually supports them
|
||||
# These are passed to CMake to conditionally add compiler flags
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_VNNI", "LLAMA_AVX512_VNNI"):
|
||||
if "AVX512_VNNI" in d["features"]:
|
||||
cmake_args.append("-DLLAMA_AVX512_VNNI=ON")
|
||||
print("-- AVX512_VNNI detected; enabling (-DLLAMA_AVX512_VNNI=ON)")
|
||||
if not _forward_bool_env(cmake_args, "CPUINFER_ENABLE_AVX512_BF16", "LLAMA_AVX512_BF16"):
|
||||
if "AVX512_BF16" in d["features"]:
|
||||
cmake_args.append("-DLLAMA_AVX512_BF16=ON")
|
||||
print("-- AVX512_BF16 detected; enabling (-DLLAMA_AVX512_BF16=ON)")
|
||||
|
||||
# Auto-enable MOE kernel only when env explicitly turns on AMD or KML backend
|
||||
# (Do not enable purely on vendor auto-detection to avoid surprise behavior.)
|
||||
amd_env = _env_get_bool("CPUINFER_ENABLE_BLIS", None)
|
||||
@@ -562,6 +452,7 @@ class CMakeBuild(build_ext):
|
||||
# Version (simple). If you later add a python package dir, you can read from it.
|
||||
################################################################################
|
||||
|
||||
|
||||
# Import version from shared version.py at project root
|
||||
_version_file = Path(__file__).resolve().parent.parent / "version.py"
|
||||
if _version_file.exists():
|
||||
|
||||
Reference in New Issue
Block a user