Offload only activated experts to the GPU (#698)

* Offload only activated experts

* This seems to do the trick for -fmoe

* Do not recalculate activated experts for fused up/gate

* Log out of bounds access details

* Add a command line argument

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-09-04 12:22:30 +02:00
committed by GitHub
parent 144d456717
commit 13c3b6412e
8 changed files with 155 additions and 45 deletions

View File

@@ -18965,6 +18965,7 @@ struct llama_context_params llama_context_default_params() {
/*.fused_up_gate =*/ true,
/*.min_experts =*/ -1,
/*.thtesh_experts =*/ 0.0f,
/*.only_active_experts =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
/*.offload_policy =*/ nullptr,
@@ -19556,6 +19557,11 @@ struct llama_context * llama_new_context_with_model(
}
}
if (params.only_active_experts) {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
}
return ctx;
}