qwen3next: avoid extra cont on linear attention output

This commit is contained in:
yurko
2026-02-07 13:30:29 -08:00
parent 0e3891b348
commit 43edfa237b

View File

@@ -4773,7 +4773,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
ggml_tensor * out = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, final_output);
cb(out, "linear_attn_out", il);
-    return ggml_cont_2d(ctx0, out, n_embd, n_tok);
+    return ggml_reshape_2d(ctx0, out, n_embd, n_tok);
};
auto build_layer_attn_linear = [&](ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,