mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-28 18:32:04 +00:00
Set mla=3 by default (#943)
so that more recent users who haven't followed the history of FlashMLA evolution — and hence don't know about the MLA options — get the best setting without having to add -mla 3 on the command line. Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -247,7 +247,7 @@ struct gpt_params {
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn        = true;  // flash attention
-    int  mla_attn          = 0;     // MLA 0: standard attention, 1: MLA with K and transposed V cache, 2: MLA with just K cache
+    int  mla_attn          = 3;     // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
     int  attn_max_batch    = 0;     // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = true;  // fused up*unary(gate) op for MoE models
     bool fused_up_gate     = true;  // fused up*unary(gate) op
Reference in New Issue
Block a user