From ddcdf25e5432be47b16362b569d465210b8f6e7d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 23 Apr 2025 14:12:08 +0300 Subject: [PATCH] Use mul_mat_qX_0_q8_2_Tx for q6_0 in FA --- ggml/src/iqk/iqk_mul_mat.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index a03b6429..b2f11b2a 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -16841,6 +16841,9 @@ struct FlashQKfp32 { #ifdef __aarch64__ MAKE_FUNCS(mul_mat_qX_0_q8_0, 1); + if (nq == 2) return std::make_pair(mul_mat_qX_0_q8_2_Tx, 2); + if (nq == 4) return std::make_pair(mul_mat_qX_0_q8_2_Tx, 4); MAKE_FUNCS(mul_mat_qX_1_q8_2_T> || std::is_same_v> || std::is_same_v>) { - constexpr size_t kMaxOnStackSize = 18432; //576; + constexpr size_t kMaxOnStackSize = 576; auto q_size = q_step*(Dk/KHelper::block_size_q)*sizeof(typename KHelper::block_q8); q_size = GGML_PAD(q_size, 64); if (q_size > kMaxOnStackSize) {