@@ -1891,7 +1891,8 @@ ggml_cgraph * llm_build_context::build_llama() {
 
         // self-attention
         if (use_rope) {
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_layer - 1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
                     this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il, true, false, true);
         }
         else {
@@ -3560,8 +3561,9 @@ ggml_cgraph * llm_build_context::build_seedoss() {
 
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer-1 ? inp_out_ids : nullptr, nullptr,
-                KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer-1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
+                KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
 
         cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_post_norm, cur,
                 model.layers[il].ffn_up, NULL, NULL,
@@ -3608,7 +3610,8 @@ ggml_cgraph * llm_build_context::build_step35() {
         }
         auto rope_freqs = layer.rope_freqs;
         layer.rope_freqs = nullptr;
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 && n_tokens > 1 ? inp_out_ids : nullptr,
                 rope_factors, is_swa ? KQ_mask_swa : KQ_mask, nullptr, nullptr, kq_scale, 0.0f, is_swa ? hparams.n_swa : 0,
                 il, true, false, true);
         layer.rope_freqs = rope_freqs;
@@ -4112,7 +4115,8 @@ ggml_cgraph * llm_build_context::build_qwen3() {
         struct ggml_tensor * inpSA = inpL;
 
         if (!rope_cache) {
-            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer-1 ? inp_out_ids : nullptr, nullptr,
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                    inp_pos, il == n_layer-1 && n_tokens > 1 ? inp_out_ids : nullptr, nullptr,
                     KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
         } else {
 
@@ -4281,17 +4285,14 @@ ggml_cgraph * llm_build_context::build_qwen3vl() {
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+    auto inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
+
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask,
                 nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true, false, true);
 
-        if (il == n_layer - 1 && n_tokens > 1) {
-            // skip computing output for unused tokens
-            struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         // feed-forward network
         cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
                 model.layers[il].ffn_up, NULL, NULL,
@@ -7033,7 +7034,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
 
             // self-attention
             if (rope_cache == nullptr) {
-                cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr,
+                cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                        inp_pos, il == n_transformer_layers - 1 ? inp_out_ids : nullptr, nullptr,
                         KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
             } else {
                 // Pre-attention norm
@@ -7068,16 +7070,17 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
                         Kcur, Vcur, Qcur, KQ_mask,
                         n_tokens, kv_head, n_kv,
                         1.0f/sqrtf(float(n_embd_head)), cb, il);
 
-                if (il == n_transformer_layers - 1 && inp_out_ids) {
-                    // skip computing output for unused tokens
-                    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                    if (rope_cache) {
-                        inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-                    }
-                }
             }
 
+            // crop output on last layer
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                // skip computing output for unused tokens
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                if (rope_cache) {
+                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                }
+            }
+
             // residual connection for attention output
             ggml_tensor * ffn_inp;
@@ -8371,12 +8374,9 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
     GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask, nullptr, nullptr,
-                1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask, nullptr, nullptr,
+                1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il, true, false, true);
 
         // feed-forward network
         bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
@@ -8443,13 +8443,10 @@ ggml_cgraph * llm_build_context::build_hunyuan_moe() {
 
     for (int il = 0; il < n_layer; ++il) {
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr, KQ_mask,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask,
                 nullptr, nullptr, kq_scale, 0.0f, 0, il, true, false, true);
 
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
                 model.layers[il].ffn_gate_inp, nullptr,
                 model.layers[il].ffn_up_exps, nullptr,
@@ -8504,15 +8501,11 @@ ggml_cgraph * llm_build_context::build_mimo2() {
         const bool is_sliding = model.hparams.swa_layers[il];
         auto KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, nullptr,
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
                 KQ_mask_l, model.layers[il].attn_sinks,
                 nullptr, 1.0f/sqrtf(float(n_embd_head_k)), 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
 
-        if (il == n_layer - 1 && inp_out_ids) {
-            // skip computing output for unused tokens
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        }
-
         auto ffn_inp = cur;
 
         if (model.layers[il].ffn_gate_inp == nullptr) {
@@ -8581,13 +8574,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
 
         struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
-        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
-                KQ_mask_l, model.layers[il].attn_sinks, nullptr, kq_scale, 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
-
-        //if (il == n_layer - 1 && inp_out_ids) {
-        //    // skip computing output for unused tokens
-        //    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-        //}
+        cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
+                inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask_l,
+                model.layers[il].attn_sinks, nullptr, kq_scale, 0.0f, is_sliding ? hparams.n_swa : 0, il, true, false, true);
 
         bool use_dup_bias = cur->ne[1] < 32 && model.layers[il].ffn_up_exps_b_dup &&
                             model.layers[il].ffn_gate_exps_b_dup &&
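Editor's note: every hunk above applies the same pattern. Instead of each model builder calling ggml_get_rows on the attention output of the last layer itself, the per-graph inp_out_ids tensor (non-null only on the last layer, and in most builders only when n_tokens > 1) is now passed into build_std_attention, which performs the crop internally. For readers unfamiliar with the idiom, the following is a minimal, self-contained sketch of what that crop does, written against the public ggml API on a plain CPU context; the tensor names and sizes are illustrative only and none of this code is part of the PR.

    #include "ggml.h"
    #include <cstdio>

    int main() {
        // tiny CPU context; sizes are arbitrary illustration values
        struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        const int n_embd = 4, n_tokens = 3;

        // stand-in for the last layer's hidden states: one column per token
        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        for (int i = 0; i < n_embd*n_tokens; ++i) ggml_set_f32_1d(cur, i, (float) i);

        // inp_out_ids: indices of the tokens whose logits are actually needed;
        // for ordinary generation that is just the last position of the batch
        struct ggml_tensor * inp_out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
        ggml_set_i32_1d(inp_out_ids, 0, n_tokens - 1);

        // "skip computing output for unused tokens": keep only the selected rows,
        // so everything downstream of the crop runs on 1 row instead of n_tokens
        cur = ggml_get_rows(ctx, cur, inp_out_ids);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, cur);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        // cur is now [n_embd, 1]: the embedding of the last token only
        printf("kept %lld row(s), first value %.1f\n", (long long) cur->ne[1], ggml_get_f32_1d(cur, 0));

        ggml_free(ctx);
        return 0;
    }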
@@ -9541,7 +9530,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     }
     std::vector<ggml_tensor*> attn(wq->n_device, nullptr);
     bool output_bias_added = false;
-    bool input_added = false;
+    int last_id = -1;
     for (int id = 0; id < wq->n_device; ++id) {
         int il_cb = 1000*(id+1) + il;
         auto split_wq = wq->splits[id];
@@ -9554,7 +9543,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
         if (!split_wq) continue;
         auto cur = get_input_tensor_sm_graph(ctx0, input, id);
-        auto input_id = cur;
         cur = do_split_norm(ctx0, cur, the_attn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
         auto input_normed = cur;
         auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
@@ -9693,11 +9681,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
         cb(cur, "flash_attn_reshaped", il_cb);
 
-        if (inp_out_ids) { // && ggml_nrows(inp_out_ids) > 1) {
+        if (inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            if (add_input && !input_added) {
-                input_id = ggml_get_rows(ctx0, input_id, inp_out_ids);
-            }
             cb(cur, "fa_get_rows", il_cb);
         }
 
@@ -9712,16 +9697,23 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             cb(cur, "kqv_wo_biased", il_cb);
             output_bias_added = true;
         }
-        if (add_input && !input_added) {
-            cur = ggml_add(ctx0, cur, input_id);
-            input_added = true;
-        }
         if (cur->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) {
             cur = ggml_cast(ctx0, cur, lctx.cparams.reduce_type);
         }
         ggml_build_forward_expand(gf, cur);
         attn[id] = cur;
+        last_id = id;
     }
+    GGML_ASSERT(last_id >= 0);
+    if (add_input) {
+        if (inp_out_ids) {
+            input = ggml_get_rows(ctx0, input, inp_out_ids);
+            cb(input, "sainp_get_rows", il);
+        }
+        attn[last_id] = ggml_add(ctx0, attn[last_id], input);
+        cb(attn[last_id], "attn_out_with_input", il);
+    }
+
     auto cur = ggml_reduce(ctx0, attn.data(), wq->n_device, GGML_OP_ADD);
     ggml_build_forward_expand(gf, cur);
     cb(cur, "attn_combined", il);
@@ -9792,7 +9784,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                     Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
         }
 
-        if (inp_out_ids) { // && ggml_nrows(inp_out_ids) > 1) {
+        if (inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             cb(cur, "sa_get_rows", il);
             if (add_input) {
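Editor's note: inside build_std_attention the hunks replace the per-device "add the residual input once" bookkeeping (input_added / input_id) with a single step after the device loop: the residual input is cropped with the same inp_out_ids (when active), folded into the last available partial result, and then all partials are combined with ggml_reduce. Below is a minimal sketch of that combine under stated assumptions: it uses only stock ggml, so repeated ggml_add stands in for ggml_reduce (which appears to be specific to this fork), and the tensors and values are made up for illustration; this is not code from the PR.

    #include "ggml.h"
    #include <vector>
    #include <cstdio>

    int main() {
        struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        const int n_embd = 4, n_tokens = 2, n_device = 3;

        // residual input and per-device partial attention outputs;
        // device 1 is "missing" (no weight split there), as the loop above allows
        struct ggml_tensor * input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        std::vector<ggml_tensor *> attn(n_device, nullptr);
        for (int id = 0; id < n_device; ++id) {
            if (id == 1) continue;
            attn[id] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
            for (int i = 0; i < n_embd*n_tokens; ++i) ggml_set_f32_1d(attn[id], i, 0.5f*(id+1));
        }
        for (int i = 0; i < n_embd*n_tokens; ++i) ggml_set_f32_1d(input, i, 1.0f);

        // fold the residual input into the last partial exactly once
        // (the role of last_id in the hunk above)
        int last_id = -1;
        for (int id = 0; id < n_device; ++id) if (attn[id]) last_id = id;
        attn[last_id] = ggml_add(ctx, attn[last_id], input);

        // combine all partials by addition (stand-in for ggml_reduce with GGML_OP_ADD)
        struct ggml_tensor * cur = nullptr;
        for (int id = 0; id < n_device; ++id) {
            if (!attn[id]) continue;
            cur = cur ? ggml_add(ctx, cur, attn[id]) : attn[id];
        }

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, cur);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        printf("combined[0] = %.3f (expected 0.5 + 1.5 + 1.0 = 3.0)\n", ggml_get_f32_1d(cur, 0));

        ggml_free(ctx);
        return 0;
    }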