DeepSeek optimizations for token generation (TG) (#928)

* Fuse concat and copy into K cache
* Avoid ggml_cont() when n_tokens = 1

Combined effect: about +2% in TG performance with full GPU offload.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-20 22:49:31 +00:00 · 2025-11-10 09:52:07 +02:00
parent 9dfbc69aee
commit 7747000f3b
4 changed files with 79 additions and 5 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -6268,11 +6268,13 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
                    kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
                    cb(kqv, "kqv", il);

-                    kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
-                    cb(kqv, "kqv_perm", il);
-
-                    cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0);
+                    if (n_tokens > 1) {
+                        kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
+                        cb(kqv, "kqv_perm", il);
+                    }
+                    cur = ggml_reshape_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens);
                    cb(cur, "kqv_2d", il);
+
                }

                ggml_build_forward_expand(gf, cur);