merge_qkv: add command line argument to enable

2026-04-26 17:39:37 +00:00 · 2025-10-29 13:27:55 +02:00
parent ca5cff8677
commit 4e8f371e76
8 changed files with 23 additions and 9 deletions
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -2441,7 +2441,7 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) {
    GGML_ASSERT(wq && wk && wv);

    bool fused_qkv = false;
-    if (wq->type == wk->type && wq->type == wv->type && hparams.f_attention_scale == 0.0f) {
+    if (ml.merge_qkv && wq->type == wk->type && wq->type == wv->type && hparams.f_attention_scale == 0.0f) {
        GGML_ASSERT(wq->ne[0] == n_embd && wq->ne[1] == n_head * n_embd_head_k);
        GGML_ASSERT(wk->ne[0] == n_embd && wk->ne[1] == n_embd_gqa);
        GGML_ASSERT(wv->ne[0] == n_embd && wv->ne[1] == n_embd_gqa);
@@ -2454,7 +2454,7 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) {
        layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
        layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1] );
        fused_qkv = true;
-        printf("Created fused qkv %s\n", layer.wqkv->name);
+        printf("Created merged qkv %s\n", layer.wqkv->name);
        if (bias) {
            auto bq_name = tn(LLM_TENSOR_ATTN_Q, "bias", i);
            auto bk_name = tn(LLM_TENSOR_ATTN_K, "bias", i);