Better CPU FA performance for DeepSeek-Lite (#410)

* Better CPU FA performance for DeepSeek-Lite

* It must be like this

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-05-13 17:53:20 +03:00
committed by GitHub
parent 537f72f9cc
commit 86dbdea6fc

View File

@@ -17242,7 +17242,7 @@ struct FlashAttn {
q_size = GGML_PAD(q_size, 64);
if (q_size > kMaxOnStackSize) {
auto qptr = get_q_storage(q_size);
-if (nq1 >= 8) {
+if (false && nq1 >= 8) {
if constexpr (std::is_same_v<KHelper, HelperQ80<Dk, k_step>>) {
#if FA_TIMING
auto t1 = Perf::cur_time();
@@ -17929,6 +17929,12 @@ inline void iqk_deepseek_helper(KHelper& kh, VHelper& vh,
if (M && S) { M += n; S += n; }
return false;
};
+if (nq1 >= 16) {
+int n_step = nq1/16;
+FlashAttn<576, 512, 16, step_k> fa(scale, softcap);
+fa.compute(kh, vh, 16*n_step, nk1, stride_q, stride_m, stride_qkv, q, mask, qkv, M, S);
+if (update(16*n_step)) return;
+}
if (nq1 >= 8) {
int n_step = nq1/8;
FlashAttn<576, 512, 8, step_k> fa(scale, softcap);