From a2f3b08fbdb5257e2a920436900b2e7394794172 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 29 Oct 2025 15:59:38 +0200
Subject: [PATCH] merge_qkv: qwen3 (dense)

---
 src/llama-build-context.cpp | 27 +++++++++------------------
 src/llama-load-tensors.cpp  |  5 ++---
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index ef373337..1e688094 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -3379,30 +3379,21 @@ ggml_cgraph * llm_build_context::build_qwen3() {
 
         // self-attention
         {
-            auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wq, nullptr,
+            auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
+                    model.layers[il].wqkv, nullptr,
+                    model.layers[il].wq, nullptr,
                     model.layers[il].wk, nullptr,
-                    model.layers[il].wv, nullptr, 0, il);
+                    model.layers[il].wv, nullptr,
+                    model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il);
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+                ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+                ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);
 
             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 743f289e..d2797911 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1015,9 +1015,8 @@ bool create_tensors_helper::create_qwen3_tensors(const LLM_TN & tn) {
 
         layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
-        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+        use_mmap_buffer &= !merge_qkv(tn, i, 0);
+
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
         layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
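
Note (not part of the patch): the patch routes the Qwen3 (dense) Q/K/V projections through the merged-QKV path, with the per-head attn_q_norm/attn_k_norm now handled inside llm_build_mul_mat_qkv instead of in build_qwen3 itself. Below is a minimal sketch of what a fused QKV projection with the Qwen3 per-head RMS norms folded in could look like in ggml. It is not the fork's llm_build_mul_mat_qkv; the helper name build_fused_qkv, the assumption that wqkv stores the Q, K and V rows contiguously in that order, and the explicit rms_eps parameter are all illustrative.

#include <tuple>
#include "ggml.h"

// Sketch only: one matmul over a merged QKV weight, then Q/K/V views plus
// per-head RMS norm on Q and K (as Qwen3 does before RoPE).
static std::tuple<ggml_tensor *, ggml_tensor *, ggml_tensor *>
build_fused_qkv(ggml_context * ctx, ggml_tensor * wqkv, ggml_tensor * cur,
                ggml_tensor * q_norm, ggml_tensor * k_norm,
                int64_t n_embd_head, int64_t n_head, int64_t n_head_kv, float rms_eps) {
    const int64_t n_tokens = cur->ne[1];

    // Single matmul: result is [(n_head + 2*n_head_kv)*n_embd_head, n_tokens]
    ggml_tensor * qkv = ggml_mul_mat(ctx, wqkv, cur);
    const size_t   es = ggml_element_size(qkv);

    // Slice the fused result into [n_embd_head, n_head(_kv), n_tokens] views,
    // assuming the row order in wqkv is [Q | K | V].
    ggml_tensor * q = ggml_view_3d(ctx, qkv, n_embd_head, n_head,    n_tokens,
                                   n_embd_head*es, qkv->nb[1], 0);
    ggml_tensor * k = ggml_view_3d(ctx, qkv, n_embd_head, n_head_kv, n_tokens,
                                   n_embd_head*es, qkv->nb[1],  n_head             *n_embd_head*es);
    ggml_tensor * v = ggml_view_3d(ctx, qkv, n_embd_head, n_head_kv, n_tokens,
                                   n_embd_head*es, qkv->nb[1], (n_head + n_head_kv)*n_embd_head*es);

    // Per-head RMS norm scaled by attn_q_norm / attn_k_norm
    // (the [n_embd_head] weights broadcast over heads and tokens).
    q = ggml_mul(ctx, ggml_rms_norm(ctx, ggml_cont(ctx, q), rms_eps), q_norm);
    k = ggml_mul(ctx, ggml_rms_norm(ctx, ggml_cont(ctx, k), rms_eps), k_norm);

    return {q, k, ggml_cont(ctx, v)};
}

A call mirroring the new build_qwen3 code would look roughly like
auto [Qcur, Kcur, Vcur] = build_fused_qkv(ctx0, model.layers[il].wqkv, cur, model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, n_embd_head, n_head, n_head_kv, hparams.f_norm_rms_eps);
followed by ggml_rope_ext on Qcur and Kcur as in the patch. On the loader side, merge_qkv(tn, i, 0) presumably builds the fused attention tensor at load time, and use_mmap_buffer &= !merge_qkv(...) drops the mmap-backed buffer whenever a merge actually happened, since a merged tensor can no longer be read directly out of the memory-mapped file.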