Fused Q and K rms_norm for TG on CUDA (#882)

* Biased mmvq: minor optimization

* Fusing Q and K rms_norm for TG on CUDA

* Remove commented out code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-31 14:41:28 +02:00
committed by GitHub
parent fd3757d4ee
commit 8c8a7fb7c8
5 changed files with 94 additions and 2 deletions

View File

@@ -1279,10 +1279,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
if (q_norm) {
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);
ggml_build_forward_expand(gf, Qcur);
}
if (k_norm) {
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur_normed", il);
ggml_build_forward_expand(gf, Kcur);
}
return {Qcur, Kcur, Vcur};

View File

@@ -2451,7 +2451,6 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) {
layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1] );
fused_qkv = true;
printf("================================== Created merged qkv %s\n", layer.wqkv->name);
if (bias) {
auto bq_name = tn(LLM_TENSOR_ATTN_Q, "bias", i);
auto bk_name = tn(LLM_TENSOR_ATTN_K, "bias", i);