From b3036a872f474beadf2df72d452ca7016db72aac Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Sat, 17 May 2025 11:21:58 +0300 Subject: [PATCH] Option to enable/disable the IQK CPU FA kernels (#429) Co-authored-by: Iwan Kawrakow --- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 12 +++++++++--- ggml/src/iqk/iqk_flash_attn.cpp | 2 +- ggml/src/iqk/iqk_mul_mat.cpp | 2 ++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 70e3bbf3..314a38fb 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,6 +131,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) +option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON) option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF) option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 4f4337c2..14650d03 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -260,9 +260,15 @@ if (GGML_IQK_MUL_MAT) add_compile_definitions(GGML_USE_IQK_MULMAT) set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp) set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h) - if (GGML_IQK_FA_ALL_QUANTS) - message(STATUS "Including all IQK FA kernels") - add_compile_definitions(GGML_IQK_FA_ALL_QUANTS) + if (GGML_IQK_FLASH_ATTENTION) + message(STATUS "Enabling IQK Flash Attention kernels") + add_compile_definitions(GGML_IQK_FLASH_ATTENTION) + if (GGML_IQK_FA_ALL_QUANTS) + message(STATUS "Including all IQK FA kernels") + add_compile_definitions(GGML_IQK_FA_ALL_QUANTS) + endif() + else() + message(STATUS "Disabling IQK Flash Attention kernels") endif() endif() diff --git a/ggml/src/iqk/iqk_flash_attn.cpp b/ggml/src/iqk/iqk_flash_attn.cpp index 
610f18b7..9a974ae7 100644 --- a/ggml/src/iqk/iqk_flash_attn.cpp +++ b/ggml/src/iqk/iqk_flash_attn.cpp @@ -8,7 +8,7 @@ #include "iqk_mul_mat.h" #include "iqk_flash_impl.h" -#ifdef IQK_IMPLEMENT +#if defined IQK_IMPLEMENT && defined GGML_IQK_FLASH_ATTENTION #include #include diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 654cc706..311554f4 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -15875,6 +15875,7 @@ void MulMat::relu(int n, const float * x, float * y) { #endif } // namespace +#ifdef GGML_IQK_FLASH_ATTENTION namespace { template @@ -18663,6 +18664,7 @@ bool iqk_flash_attn_impl(int int_type_k, // type of k return true; } +#endif #else // IQK_IMPLEMENT