Fix llama-bench mla parameter (#1016)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author:    Kawrakow
Date:      2025-11-27 09:33:30 +01:00
Committer: GitHub
Parent:    8c39ff966d
Commit:    45cd1a70f5
2 changed files with 6 additions and 2 deletions

File 1, llama-bench (struct cmd_params_instance): the -mla value parsed from the command line was never copied into the llama_model_params handed to the loader, so the requested MLA mode silently had no effect. The fix is a single assignment:

@@ -937,6 +937,7 @@ struct cmd_params_instance {
         mparams.use_thp = use_thp;
         mparams.merge_qkv = mqkv;
         mparams.tensor_buft_overrides = buft_overrides;
+        mparams.mla = mla_attn;
         return mparams;
     }
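
Why one assignment fixes the bug: cmd_params_instance gathers the command-line options and converts them into the llama_model_params passed to the loader, and any field left out of that conversion is silently dropped, which is exactly what happened to the MLA setting. A minimal self-contained sketch of the pattern, with hypothetical stand-in types (model_params, cmd_instance, and to_mparams are not the real llama-bench names):

    #include <cstdio>

    // Stand-ins for llama_model_params and cmd_params_instance (hypothetical).
    struct model_params { int mla = 0; bool use_thp = false; };

    struct cmd_instance {
        int  mla_attn = 3;     // parsed from the command line
        bool use_thp  = false;

        model_params to_mparams() const {
            model_params mp;
            mp.use_thp = use_thp;
            mp.mla     = mla_attn;  // the assignment this commit adds; without it
                                    // the parsed value never reaches the model
            return mp;
        }
    };

    int main() {
        cmd_instance inst;
        std::printf("mla passed to the model: %d\n", inst.to_mparams().mla);
        return 0;
    }

With the fix, an invocation along the lines of ./llama-bench -m model.gguf -mla 2 (flag spelling assumed, not taken from this diff) actually benchmarks the requested MLA mode instead of the default.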

File 2, llm_prepare_mla: a log line for the computed wk_b tensor is added in its own branch (followed by a graph clear before the wv_b build), and the existing wv_b log line, a copy-paste of the wk_b one, is corrected to report the tensor it actually describes.

@@ -1415,6 +1415,9 @@ static void llm_prepare_mla(llama_model & model, int mla) {
         l.wk_b = l.computed_wk_b.get();
         model.tensors_by_name.push_back(std::make_pair(name, l.wk_b));
+        printf("Computed %s as %ld x %ld x %ld and stored in buffer %s\n", name.c_str(), wk_b->ne[0], wk_b->ne[1], wk_b->ne[2],
+                ggml_backend_buffer_name(l.computed_wk_b->buffer));
+        ggml_graph_clear(graph);
         auto wv_b = ggml_cont(ctx, ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_v, n_head,
             l.wkv_b->nb[1], l.wkv_b->nb[1]*(n_embd_head_qk_nope + n_embd_head_v), l.wkv_b->nb[1]*n_embd_head_qk_nope));
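
Before the second hunk, it helps to see what the strided view above computes: wkv_b fuses the per-head no-PE K rows and V rows into one matrix, and ggml_view_3d carves out just the V rows of every head using a per-row stride (nb1), a per-head stride (nb2), and a byte offset that skips the qk_nope rows; ggml_cont then turns that strided view into a contiguous tensor. A hedged, self-contained sketch against the public ggml C API with made-up dimension sizes (this is not the llm_prepare_mla code):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        const int64_t kv_lora_rank = 8, n_head = 4;
        const int64_t n_embd_head_qk_nope = 6, n_embd_head_v = 5;

        ggml_init_params ip = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(ip);

        // Fused matrix: each head contributes (qk_nope + v) rows of length
        // kv_lora_rank (ne[0] is the contiguous dimension in ggml).
        ggml_tensor * wkv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
            kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v));

        // V rows only, as a [kv_lora_rank x n_embd_head_v x n_head] view:
        // step nb[1] per row, nb[1]*(qk_nope+v) per head, and start past the
        // qk_nope rows of the first head.
        ggml_tensor * v_view = ggml_view_3d(ctx, wkv_b,
            kv_lora_rank, n_embd_head_v, n_head,
            wkv_b->nb[1],
            wkv_b->nb[1] * (n_embd_head_qk_nope + n_embd_head_v),
            wkv_b->nb[1] * n_embd_head_qk_nope);

        // ggml_cont only records a copy-to-contiguous node; its data is filled
        // in when a graph containing it is computed, which is why the real
        // code builds a graph per tensor and clears it in between.
        ggml_tensor * wv_b = ggml_cont(ctx, v_view);

        std::printf("view: %lld x %lld x %lld (cont: %lld x %lld x %lld)\n",
            (long long) v_view->ne[0], (long long) v_view->ne[1], (long long) v_view->ne[2],
            (long long) wv_b->ne[0],   (long long) wv_b->ne[1],   (long long) wv_b->ne[2]);

        ggml_free(ctx);
        return 0;
    }

(The sketch prints the int64_t ne values via %lld with casts; the %ld in the diff's printf is only exact on platforms where long is 64 bits.)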
@@ -1444,8 +1447,8 @@ static void llm_prepare_mla(llama_model & model, int mla) {
         l.wv_b = l.computed_wv_b.get();
         model.tensors_by_name.push_back(std::make_pair(name, l.wv_b));
-        printf("Computed %s as %ld x %ld x %ld and stored in buffer %s\n", name.c_str(), wk_b->ne[0], wk_b->ne[1], wk_b->ne[2],
-                ggml_backend_buffer_name(l.computed_wk_b->buffer));
+        printf("Computed %s as %ld x %ld x %ld and stored in buffer %s\n", name.c_str(), wv_b->ne[0], wv_b->ne[1], wv_b->ne[2],
+                ggml_backend_buffer_name(l.computed_wv_b->buffer));
         ggml_graph_clear(graph);
     }
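
This hunk fixes a copy-paste slip: the log line after computing wv_b reported wk_b's dimensions and buffer. Together with the ggml_graph_clear added in the previous hunk, each of the two tensor computations now builds on a freshly cleared graph instead of appending to the other's nodes. A minimal sketch of that reuse-one-graph pattern, assuming the single-threaded CPU helper ggml_graph_compute_with_ctx and the float-scalar ggml_scale (both are assumptions about the ggml revision this tree tracks, not taken from this diff):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        ggml_init_params ip = { /*mem_size*/ 64u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(ip);

        ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(a, 1.0f);

        ggml_cgraph * graph = ggml_new_graph(ctx);

        // First computation (wk_b in the real code).
        ggml_tensor * first = ggml_scale(ctx, a, 2.0f);
        ggml_build_forward_expand(graph, first);
        ggml_graph_compute_with_ctx(ctx, graph, /*n_threads*/ 1);

        // Without this clear, the next build would append to the existing
        // nodes and recompute them on the second compute call.
        ggml_graph_clear(graph);

        // Second computation (wv_b in the real code).
        ggml_tensor * second = ggml_scale(ctx, a, 3.0f);
        ggml_build_forward_expand(graph, second);
        ggml_graph_compute_with_ctx(ctx, graph, /*n_threads*/ 1);

        std::printf("first[0] = %f, second[0] = %f\n",
            ggml_get_f32_1d(first, 0), ggml_get_f32_1d(second, 0));

        ggml_free(ctx);
        return 0;
    }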