Change flash attention and fmoe to be on by default (#863)

* Change fmoe to be on by default

* Change default fmoe also in llama-bench

* Change flash attention to be on by default

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-25 09:37:28 +03:00
committed by GitHub
parent 2522c97dc9
commit 16f30fcf31
4 changed files with 26 additions and 26 deletions

View File

@@ -260,7 +260,7 @@ struct cmd_params {
bool verbose;
bool warmup;
bool repack = false;
-bool fmoe = false;
+bool fmoe = true;
bool ger = false; // ger = Grouped Expert Routing
bool no_fug = false;
bool use_thp = false;
@@ -285,7 +285,7 @@ static const cmd_params cmd_params_defaults = {
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
-/* flash_attn */ {false},
+/* flash_attn */ {true},
/* mla_attn */ {0},
/* attn_max_batch */ {0},
/* ser */ {{-1,0.0f}},
@@ -298,7 +298,7 @@ static const cmd_params cmd_params_defaults = {
/* verbose */ false,
/* warmup */ true,
/* repack */ false,
-/* fmoe */ false,
+/* fmoe */ true,
/* ger */ false,
/* no_fug */ false,
/* use_thp */ false,
@@ -846,7 +846,7 @@ struct cmd_params_instance {
bool use_mmap;
bool embeddings;
bool repack = false;
-bool fmoe = false;
+bool fmoe = true;
bool ger = false;
bool no_fug = false;
bool use_thp = false;