Offload only activated experts to the GPU (#698)
* Offload only activated experts

* This seems to do the trick for -fmoe

* Do not recalculate activated experts for fused up/gate

* Log out of bounds access details

* Add a command line argument

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -18965,6 +18965,7 @@ struct llama_context_params llama_context_default_params() {
         /*.fused_up_gate              =*/ true,
         /*.min_experts                =*/ -1,
         /*.thtesh_experts             =*/ 0.0f,
+        /*.only_active_experts        =*/ false,
         /*.abort_callback             =*/ nullptr,
         /*.abort_callback_data        =*/ nullptr,
         /*.offload_policy             =*/ nullptr,
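For orientation, a minimal sketch of opting into the new flag when creating a context. The field `only_active_experts` and the functions `llama_context_default_params` / `llama_new_context_with_model` come from the diff above; the surrounding helper is illustrative only, not part of the commit:

```cpp
// Minimal sketch (not from the commit): enable the new expert-offload flag.
// only_active_experts is the field added above, default false; everything
// else is standard llama.cpp context setup.
#include "llama.h"

llama_context * make_moe_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.only_active_experts = true; // copy only router-selected experts to the GPU
    return llama_new_context_with_model(model, cparams);
}
```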
@@ -19556,6 +19557,11 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.only_active_experts) {
+        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
+    }
+
     return ctx;
 }
 
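The selection logic itself lives behind `ggml_backend_sched_set_only_active_experts` in the backend scheduler and is not part of this excerpt. Purely as intuition, the technique the commit title names reduces to something like the sketch below; all names in it are assumptions, not this repo's code:

```cpp
// Conceptual sketch only -- NOT ik_llama.cpp's scheduler implementation.
// Idea behind the commit: instead of uploading all n_expert weight tensors of
// a MoE layer, upload just the experts the router actually picked for this batch.
#include <cstdint>
#include <unordered_set>
#include <vector>

// top_k_ids: the router's expert choices for every token in the batch.
// Deduplicate them, since many tokens typically select the same expert.
std::unordered_set<int32_t> active_experts(const std::vector<int32_t> & top_k_ids) {
    return {top_k_ids.begin(), top_k_ids.end()};
}

// Hypothetical upload loop (copy_expert_to_gpu is a made-up name):
//     for (int32_t e : active_experts(top_k_ids)) copy_expert_to_gpu(layer, e);
```

The payoff of such a scheme is presumably largest at small batch sizes, where only a handful of experts per layer are active and most expert weights never need to cross the PCIe bus.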