From 1fdbc0dafed3d3e6fae0adb14e08f262d7f412b8 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 9 Feb 2026 16:20:16 +0100
Subject: [PATCH] Fix #1222 (#1257)

* Fix #1222

* Typo
---
 src/llama-build-context.cpp | 102 +++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 55 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index d2c64475..eb6d3894 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1891,7 +1891,8 @@ ggml_cgraph * llm_build_context::build_llama() {
 
         // self-attention
         if (use_rope) {
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_layer - 1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
                     this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il, true, false, true);
         } else {
@@ -3560,8 +3561,9 @@ ggml_cgraph * llm_build_context::build_seedoss() {
 
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer-1 ? inp_out_ids : nullptr, nullptr,
-                KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer-1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
+                KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
 
         cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_post_norm, cur,
                 model.layers[il].ffn_up, NULL, NULL,
@@ -3608,7 +3610,8 @@ ggml_cgraph * llm_build_context::build_step35() {
             }
             auto rope_freqs = layer.rope_freqs;
             layer.rope_freqs = nullptr;
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_layer - 1 && n_tokens > 1 ? inp_out_ids : nullptr,
                     rope_factors, is_swa ? KQ_mask_swa : KQ_mask, nullptr, nullptr, kq_scale, 0.0f,
                     is_swa ? hparams.n_swa : 0, il, true, false, true);
             layer.rope_freqs = rope_freqs;
@@ -4112,7 +4115,8 @@ ggml_cgraph * llm_build_context::build_qwen3() {
         struct ggml_tensor * inpSA = inpL;
 
         if (!rope_cache) {
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer-1 ? inp_out_ids : nullptr, nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_layer-1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
                     KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
         } else {
@@ -4281,17 +4285,14 @@ ggml_cgraph * llm_build_context::build_qwen3vl() {
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+    auto inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
+
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask,
                 nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true, false, true);
 
-        if (il == n_layer - 1 && n_tokens > 1) {
-            // skip computing output for unused tokens
-            struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         // feed-forward network
         cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
                 model.layers[il].ffn_up, NULL, NULL,
@@ -7033,7 +7034,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
 
         // self-attention
         if (rope_cache == nullptr) {
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_transformer_layers - 1 ? inp_out_ids : nullptr, nullptr,
                     KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
         } else {
             // Pre-attention norm
@@ -7068,16 +7070,17 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
                     Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                // skip computing output for unused tokens
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                if (rope_cache) {
+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                }
+            }
         }
 
         // crop output on last layer
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
-            // skip computing output for unused tokens
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            if (rope_cache) {
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-        }
 
         // residual connection for attention output
         ggml_tensor * ffn_inp;
@@ -8371,12 +8374,9 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
     GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask, nullptr, nullptr,
-                1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask, nullptr, nullptr,
+                1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
 
         // feed-forward network
         bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
@@ -8443,13 +8443,10 @@ ggml_cgraph * llm_build_context::build_hunyuan_moe() {
 
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask,
                 nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
 
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
                 model.layers[il].ffn_gate_inp, nullptr,
                 model.layers[il].ffn_up_exps, nullptr,
@@ -8504,15 +8501,11 @@ ggml_cgraph * llm_build_context::build_mimo2() {
         const bool is_sliding = model.hparams.swa_layers[il];
         auto KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
                 KQ_mask_l, model.layers[il].attn_sinks, nullptr,
                 1.0f/sqrtf(float(n_embd_head_k)), 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
 
-        if (il == n_layer - 1 && inp_out_ids) {
-            // skip computing output for unused tokens
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         auto ffn_inp = cur;
 
         if (model.layers[il].ffn_gate_inp == nullptr) {
@@ -8581,13 +8574,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
 
         struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
-                KQ_mask_l, model.layers[il].attn_sinks, nullptr, kq_scale, 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
-
-        //if (il == n_layer - 1 && inp_out_ids) {
-        //    // skip computing output for unused tokens
-        //    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        //}
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask_l,
+                model.layers[il].attn_sinks, nullptr, kq_scale, 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
 
         bool use_dup_bias = cur->ne[1] < 32 &&
                 model.layers[il].ffn_up_exps_b_dup && model.layers[il].ffn_gate_exps_b_dup &&
@@ -9541,7 +9530,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     }
     std::vector<ggml_tensor *> attn(wq->n_device, nullptr);
     bool output_bias_added = false;
-    bool input_added = false;
+    int last_id = -1;
     for (int id = 0; id < wq->n_device; ++id) {
         int il_cb = 1000*(id+1) + il;
         auto split_wq = wq->splits[id];
@@ -9554,7 +9543,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
         if (!split_wq) continue;
        auto cur = get_input_tensor_sm_graph(ctx0, input, id);
-        auto input_id = cur;
        cur = do_split_norm(ctx0, cur, the_attn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
        auto input_normed = cur;
        auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
@@ -9693,11 +9681,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
         cb(cur, "flash_attn_reshaped", il_cb);
 
-        if (inp_out_ids) { // && ggml_nrows(inp_out_ids) > 1) {
+        if (inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            if (add_input && !input_added) {
-                input_id = ggml_get_rows(ctx0, input_id, inp_out_ids);
-            }
             cb(cur, "fa_get_rows", il_cb);
         }
@@ -9712,16 +9697,23 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             cb(cur, "kqv_wo_biased", il_cb);
             output_bias_added = true;
         }
-        if (add_input && !input_added) {
-            cur = ggml_add(ctx0, cur, input_id);
-            input_added = true;
-        }
         if (cur->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
             cur = ggml_cast(ctx0, cur, lctx.cparams.reduce_type);
         }
         ggml_build_forward_expand(gf, cur);
         attn[id] = cur;
+        last_id = id;
     }
+    GGML_ASSERT(last_id >= 0);
+    if (add_input) {
+        if (inp_out_ids) {
+            input = ggml_get_rows(ctx0, input, inp_out_ids);
+            cb(input, "sainp_get_rows", il);
+        }
+        attn[last_id] = ggml_add(ctx0, attn[last_id], input);
+        cb(attn[last_id], "attn_out_with_input", il);
+    }
+
     auto cur = ggml_reduce(ctx0, attn.data(), wq->n_device, GGML_OP_ADD);
     ggml_build_forward_expand(gf, cur);
     cb(cur, "attn_combined", il);
@@ -9792,7 +9784,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
     }
 
-    if (inp_out_ids) { // && ggml_nrows(inp_out_ids) > 1) {
+    if (inp_out_ids) {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
         cb(cur, "sa_get_rows", il);
         if (add_input) {