From ddc88bac17bac6faf98cb1f3c1ec33f756e72001 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 12 Nov 2025 11:00:58 +0200
Subject: [PATCH] Set mla=3 by default (#943)

So that more recent users who haven't followed the history of FlashMLA
evolution (and hence don't know about the MLA options) get the best
setting without having to add -mla 3 on the command line.

Co-authored-by: Iwan Kawrakow
---
 common/common.h |  2 +-
 src/llama.cpp   | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/common/common.h b/common/common.h
index 776fa255..ee959618 100644
--- a/common/common.h
+++ b/common/common.h
@@ -247,7 +247,7 @@ struct gpt_params {
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn        = true;  // flash attention
-    int  mla_attn          = 0;     // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int  mla_attn          = 3;     // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
     int  attn_max_batch    = 0;     // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = true;  // fused up*unary(gate) op for MoE models
     bool fused_up_gate     = true;  // fused up*unary(gate) op
diff --git a/src/llama.cpp b/src/llama.cpp
index 413d3fae..8bb4e266 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1820,7 +1820,9 @@ static bool llm_load_tensors(
         }
     }
 
-    llm_prepare_mla(model, mla_attn);
+    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+        llm_prepare_mla(model, mla_attn);
+    }
 
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
@@ -3831,7 +3833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings              =*/ false,
         /*.offload_kqv             =*/ true,
         /*.flash_attn              =*/ true,
-        /*.mla_attn                =*/ 0,
+        /*.mla_attn                =*/ 3,
         /*.attn_max_batch          =*/ 0,
         /*.fused_moe_up_gate       =*/ true,
         /*.grouped_expert_routing  =*/ false,
@@ -4208,10 +4210,10 @@ struct llama_context * llama_new_context_with_model(
         params.seed = time(NULL);
     }
 
-    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn > 0) {
-        LLAMA_LOG_WARN("=====================================================================\n");
-        LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
-        LLAMA_LOG_WARN("=====================================================================\n");
+    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn != 0) {
+        //LLAMA_LOG_WARN("=====================================================================\n");
+        //LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
+        //LLAMA_LOG_WARN("=====================================================================\n");
         cparams.mla_attn = 0;
     }
 
@@ -4219,7 +4221,9 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: n_batch    = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
+    if (model->arch == LLM_ARCH_DEEPSEEK2) {
     LLAMA_LOG_INFO("%s: mla_attn   = %d\n", __func__, cparams.mla_attn);
+    }
     LLAMA_LOG_INFO("%s: attn_max_b = %d\n", __func__, cparams.attn_max_batch);
     LLAMA_LOG_INFO("%s: fused_moe  = %d\n", __func__, cparams.fused_moe_up_gate);
     LLAMA_LOG_INFO("%s: grouped er = %d\n", __func__, cparams.grouped_expert_routing);
@@ -4451,7 +4455,7 @@ struct llama_context * llama_new_context_with_model(
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
-        }
+    }
     }
 
 }
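
Note (not part of the patch): a minimal sketch of what the new default means for API callers, using only the entry points that appear in the hunks above (llama_context_default_params, llama_new_context_with_model, the mla_attn field, LLM_ARCH_DEEPSEEK2). The helper name make_default_ctx is hypothetical.

    #include "llama.h"

    // Hypothetical helper: create a context with default parameters after this patch.
    llama_context * make_default_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        // cparams.mla_attn is now 3 (it was 0 before this patch), so DeepSeek2-style
        // models get the best MLA setting without any explicit -mla 3 / mla_attn change.
        // For any other architecture llama_new_context_with_model() resets it to 0,
        // now silently (the warning banner is commented out above).
        return llama_new_context_with_model(model, cparams);
    }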