Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Set mla=3 by default (#943)
So that more recent users, who haven't followed the history of FlashMLA evolution and hence don't know about the MLA options, get the best setting without having to add -mla 3 on the command line.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
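For illustration only, not part of the commit: a minimal sketch of how the new default surfaces through the public API, assuming the usual llama.h header name and using only the entry points visible in the diff below (llama_context_default_params, llama_new_context_with_model). The helper name make_ctx_params is hypothetical.

    #include "llama.h"  // assumed public header of ik_llama.cpp

    // Hypothetical helper: rely on the new default (mla_attn = 3) or opt out explicitly.
    llama_context_params make_ctx_params(bool use_mla) {
        llama_context_params cparams = llama_context_default_params(); // mla_attn now defaults to 3
        if (!use_mla) {
            cparams.mla_attn = 0; // fall back to standard attention
        }
        return cparams; // pass to llama_new_context_with_model(model, cparams)
    }

As the diff below shows, for architectures other than LLM_ARCH_DEEPSEEK2 the context constructor now silently resets mla_attn to 0 instead of printing a warning, so an explicit override is only needed when you actually want to disable MLA for a DeepSeek2 model.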
@@ -247,7 +247,7 @@ struct gpt_params {
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn        = true;  // flash attention
-    int  mla_attn          = 0;     // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int  mla_attn          = 3;     // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
     int  attn_max_batch    = 0;     // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = true;  // fused up*unary(gate) op for MoE models
     bool fused_up_gate     = true;  // fused up*unary(gate) op
@@ -1820,7 +1820,9 @@ static bool llm_load_tensors(
             }
         }

-    llm_prepare_mla(model, mla_attn);
+    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+        llm_prepare_mla(model, mla_attn);
+    }

     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
@@ -3831,7 +3833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings                =*/ false,
         /*.offload_kqv               =*/ true,
         /*.flash_attn                =*/ true,
-        /*.mla_attn                  =*/ 0,
+        /*.mla_attn                  =*/ 3,
         /*.attn_max_batch            =*/ 0,
         /*.fused_moe_up_gate         =*/ true,
         /*.grouped_expert_routing    =*/ false,
@@ -4208,10 +4210,10 @@ struct llama_context * llama_new_context_with_model(
         params.seed = time(NULL);
     }

-    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn > 0) {
-        LLAMA_LOG_WARN("=====================================================================\n");
-        LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
-        LLAMA_LOG_WARN("=====================================================================\n");
+    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn != 0) {
+        //LLAMA_LOG_WARN("=====================================================================\n");
+        //LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
+        //LLAMA_LOG_WARN("=====================================================================\n");
         cparams.mla_attn = 0;
     }

@@ -4219,7 +4221,9 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: n_batch    = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
+    if (model->arch == LLM_ARCH_DEEPSEEK2) {
     LLAMA_LOG_INFO("%s: mla_attn   = %d\n", __func__, cparams.mla_attn);
+    }
     LLAMA_LOG_INFO("%s: attn_max_b = %d\n", __func__, cparams.attn_max_batch);
     LLAMA_LOG_INFO("%s: fused_moe  = %d\n", __func__, cparams.fused_moe_up_gate);
     LLAMA_LOG_INFO("%s: grouped er = %d\n", __func__, cparams.grouped_expert_routing);
@@ -4451,7 +4455,7 @@ struct llama_context * llama_new_context_with_model(
                 (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
             }
         }
     }
 }