From eb9882407fabf05eccbc659e2a30d7b55b10d7c8 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 30 Nov 2025 16:12:20 +0000
Subject: [PATCH] Better

---
 src/llama-build-context.cpp |  6 +++++-
 src/llama-load-tensors.cpp  | 10 ++++++++--
 src/llama-model.h           |  2 ++
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 3f397a19..28b92ea1 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -9220,11 +9220,15 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         else if (cur->type != GGML_TYPE_F32) {
             cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
         }
+        auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
+            ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr;
+        auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ?
+            ((ggml_split_tensor_t *)model.layers[il].attn_k_norm->extra)->splits[id] : model.layers[il].attn_k_norm : nullptr;
         auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, nullptr, nullptr, nullptr, nullptr,
                 split_wq, bq ? bq->splits[id] : nullptr,
                 split_wk, bk ? bk->splits[id] : nullptr,
                 split_wv, bv ? bv->splits[id] : nullptr,
-                model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il_cb);
+                the_q_norm, the_k_norm, f_attn_scale, il_cb);
         auto rope_factors = rope_factors_in;
         if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) {
             auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra;
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index e5951b5b..af1752b8 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1858,9 +1858,9 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 
         // K/Q norm tensors (optional for GLM-4.5 355B variant)
-        layer.attn_q_norm = create_tensor(ctx_layer,
+        layer.attn_q_norm = create_tensor(ctx_split,
             tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
-        layer.attn_k_norm = create_tensor(ctx_layer,
+        layer.attn_k_norm = create_tensor(ctx_split,
             tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
 
         // Why are we adding an additional tensor type?
@@ -2945,6 +2945,9 @@ bool create_tensors_helper::create_tensors() {
                 if (layer.bq) {
                     prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used);
                 }
+                if (layer.attn_q_norm) {
+                    prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, split, mem_used);
+                }
                 for (auto & s : split) s /= gqa_ratio;
                 prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split, mem_used);
                 prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used);
@@ -2954,6 +2957,9 @@ bool create_tensors_helper::create_tensors() {
                 if (layer.bv) {
                     prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used);
                 }
+                if (layer.attn_k_norm) {
+                    prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split, mem_used);
+                }
             }
 
             if (layer.ffn_norm) {
diff --git a/src/llama-model.h b/src/llama-model.h
index 99ec2d42..c7fe8b68 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -198,6 +198,8 @@ struct llama_layer {
     llama_split_tensor split_bqkv;
    llama_split_tensor split_bqk;
     llama_split_tensor split_bkv;
+    llama_split_tensor split_q_norm;
+    llama_split_tensor split_k_norm;
 
     // relative position bias
     struct ggml_tensor * attn_rel_b = nullptr;
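Notes: the change lets the optional GLM-4.5 Q/K norm tensors follow the same per-device split path as the attention weights. In build_std_attention() the new ternary picks, for the current device id, the shard stored in the tensor's extra split descriptor when one exists, falls back to the whole tensor when it does not, and passes nullptr through when the layer has no such norm. The standalone C++ sketch below restates only that selection logic; tensor, split_tensor and pick_norm are illustration-only stand-ins, not the real ggml_tensor / ggml_split_tensor_t types from the patch.

#include <array>
#include <cstdio>

// Illustration-only stand-ins for ggml_tensor / ggml_split_tensor_t.
struct tensor {
    const char * name;
    void       * extra = nullptr;   // points at a split_tensor when the tensor is sharded
};

struct split_tensor {
    std::array<tensor *, 4> splits{};   // one shard per device
};

// Mirrors the ternary the patch adds in build_std_attention(): prefer the
// per-device shard when a split descriptor is attached, otherwise fall back
// to the whole tensor, and propagate nullptr when the optional norm is absent.
static tensor * pick_norm(tensor * norm, int id) {
    return norm ? (norm->extra ? ((split_tensor *) norm->extra)->splits[id] : norm)
                : nullptr;
}

int main() {
    tensor full  {"attn_q_norm.full"};
    tensor shard0{"attn_q_norm.shard0"};
    tensor shard1{"attn_q_norm.shard1"};

    split_tensor split;
    split.splits[0] = &shard0;
    split.splits[1] = &shard1;

    tensor sharded{"attn_q_norm", &split};

    std::printf("%s\n", pick_norm(&sharded, 1)->name);   // attn_q_norm.shard1
    std::printf("%s\n", pick_norm(&full,    1)->name);   // attn_q_norm.full
    std::printf("%p\n", (void *) pick_norm(nullptr, 1)); // null: layer has no q norm
    return 0;
}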