From 17e810142f67ff3a9a9b8f2b2755a65675f23c2f Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 9 Feb 2025 11:13:13 +0200 Subject: [PATCH] Use type_k and type_v to set the types of the MLA caches They were hard-coded at f16. On my Ryzen-7950X with native bf16 support I get a fairly significant PP performance boost with bf16 KV-cache: PP-4096 = 320 t/s up from 292 t/s with f16 KV-cache. --- src/llama.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 17d25733..00e6c934 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3119,8 +3119,10 @@ static bool llama_kv_cache_init( cache.size = kv_size; cache.used = 0; - cache.type_k = type_k; - cache.type_v = type_v; + cache.type_k = type_k; + cache.type_v = type_v; + cache.type_kr = type_k; + cache.type_kv = type_v; cache.cells.clear(); cache.cells.resize(kv_size); @@ -13545,7 +13547,7 @@ struct llm_build_context { kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); - if (!pp_opt) { + if (!pp_opt) { kq = ggml_permute(ctx0, kq, 0, 2, 1, 3); cb(kq, "kq_soft_max_ext_perm", il); }