Fix attn split

Granularity for Wq and Wo is not just the head size, but head size * gqa_ratio. Otherwise, the Wk and Wv tensors end up not being a multiple of the head size when we divide the split determined by Wo by the gqa_ratio.
2026-04-29 19:01:47 +00:00 · 2025-11-27 05:03:52 +00:00
parent 52a7cbe482
commit 7d84dca29e
1 changed files with 7 additions and 5 deletions
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -139,7 +139,7 @@ struct create_tensors_helper : public create_tensors_helper_interface {
            ggml_context ** actual_ctx = nullptr);

    void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias);
-    void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true);
+    void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true, bool use_ctx_split = false);

    void create_std_attn(int i, const LLM_TN & tn, llama_layer & layer, int n_embd, int n_embd_gqa, ggml_context * ctx_split);
    void create_std_ffn(int i, const LLM_TN & tn, llama_layer & layer, int n_ff, int n_embd, ggml_context * ctx_split);
@@ -360,17 +360,18 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
        bool use_mmap_buffer = true;


-void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm) {
+void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm, bool use_ctx_split) {
    model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

    if (has_norm) {
    model.output_norm = create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
    }
-    model.output      = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+    auto out_ctx = use_ctx_split ? ctx_output_split : ctx_output;
+    model.output      = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);

    // if output is NULL, init from the input tok embed
    if (model.output == NULL) {
-        model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+        model.output = create_tensor(out_ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
    }
 }

@@ -389,7 +390,7 @@ void create_tensors_helper::create_std_ffn(int i, const LLM_TN & tn, llama_layer

 bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) {
    LOADING_PRELUDE
-    create_embd_output(tn, n_embd, n_vocab);
+    create_embd_output(tn, n_embd, n_vocab, true, true);

    for (int i = 0; i < n_layer; ++i) {
        ggml_context * ctx_layer = ctx_for_layer(i);
@@ -2934,6 +2935,7 @@ bool create_tensors_helper::create_tensors() {
                    auto tt = ggml_internal_get_type_traits(layer.wo->type);
                    if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
                }
+                attn_granularity *= gqa_ratio;
                GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
                auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits);
                prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split);