Change flash attention and fmoe to be on by default (#863)

* Change fmoe to be on by default

* Change default fmoe also in llama-bench

* Change flash attention to be on by default

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-25 09:37:28 +03:00
committed by GitHub
parent 2522c97dc9
commit 16f30fcf31
4 changed files with 26 additions and 26 deletions

View File

@@ -260,7 +260,7 @@ struct cmd_params {
bool verbose;
bool warmup;
bool repack = false;
-bool fmoe = false;
+bool fmoe = true;
bool ger = false; // ger = Grouped Expert Routing
bool no_fug = false;
bool use_thp = false;
@@ -285,7 +285,7 @@ static const cmd_params cmd_params_defaults = {
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
-/* flash_attn */ {false},
+/* flash_attn */ {true},
/* mla_attn */ {0},
/* attn_max_batch */ {0},
/* ser */ {{-1,0.0f}},
@@ -298,7 +298,7 @@ static const cmd_params cmd_params_defaults = {
/* verbose */ false,
/* warmup */ true,
/* repack */ false,
-/* fmoe */ false,
+/* fmoe */ true,
/* ger */ false,
/* no_fug */ false,
/* use_thp */ false,
@@ -846,7 +846,7 @@ struct cmd_params_instance {
bool use_mmap;
bool embeddings;
bool repack = false;
-bool fmoe = false;
+bool fmoe = true;
bool ger = false;
bool no_fug = false;
bool use_thp = false;