Offload only activated experts to the GPU (#698)

* Offload only activated experts

* This seems to do the trick for -fmoe

* Do not recalculate activated experts for fused up/gate

* Log out of bounds access details

* Add a command line argument

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-09-04 12:22:30 +02:00
committed by GitHub
parent 144d456717
commit 13c3b6412e
8 changed files with 155 additions and 45 deletions

View File

@@ -18965,6 +18965,7 @@ struct llama_context_params llama_context_default_params() {
/*.fused_up_gate =*/ true,
/*.min_experts =*/ -1,
/*.thtesh_experts =*/ 0.0f,
/*.only_active_experts =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
/*.offload_policy =*/ nullptr,
@@ -19556,6 +19557,11 @@ struct llama_context * llama_new_context_with_model(
}
}
if (params.only_active_experts) {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
}
return ctx;
}