From ed31b6741f3d60a4f97603436328a8044b76b72f Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 29 Oct 2025 15:44:24 +0200
Subject: [PATCH] merge_qkv: llama-4

---
 src/llama-build-context.cpp | 2 +-
 src/llama-load-tensors.cpp  | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 90844b71..ef373337 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1373,7 +1373,7 @@ ggml_cgraph * llm_build_context::build_llama() {
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
             } else if (inp_attn_scale) {
-                Qcur = ggml_mul(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_attn_scale);
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
             }
 
             cb(Qcur, "Qcur", il);
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 8bdad583..743f289e 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -416,9 +416,8 @@ bool create_tensors_helper::create_llama4_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        use_mmap_buffer &= !merge_qkv(tn, i, 0);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
         layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);