From e0eebfd8adc7d03b2d14723a6eead9b36d6ee4af Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 10 Mar 2025 19:07:53 +0200 Subject: [PATCH] Try using fp32 for FlashMLA --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ba5c5052..c93887d0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13677,9 +13677,9 @@ struct llm_build_context { ggml_build_forward_expand(gf, q); kqv = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask, kq_scale, hparams.f_max_alibi_bias, 0.f); - if (q->ne[1] <= 8) { + //if (q->ne[1] <= 8) { ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32); - } + //} cb(kqv, "kqv", il); cur = ggml_reshape_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens);