DeepSeek optimizations for TG (#928)

* Fuse concat and copy into K cache
* Avoid ggml_cont() when n_tokens = 1

Combined effect: about +2% in TG performance with full GPU offload

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-11-10 09:52:07 +02:00
committed by GitHub
parent 9dfbc69aee
commit 7747000f3b
4 changed files with 79 additions and 5 deletions

View File

@@ -6268,11 +6268,13 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
cb(kqv, "kqv", il);
kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
cb(kqv, "kqv_perm", il);
cur = ggml_view_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens, ggml_row_size(kqv->type, n_embd_head_v*n_head), 0);
if (n_tokens > 1) {
kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
cb(kqv, "kqv_perm", il);
}
cur = ggml_reshape_2d(ctx0, kqv, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_2d", il);
}
ggml_build_forward_expand(gf, cur);