mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-07 20:40:02 +00:00
qwen3next: split cpu/cuda eval builds and tune PP scheduling
This commit is contained in:
@@ -4984,8 +4984,18 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
|
||||
if (params.only_active_experts) {
|
||||
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
|
||||
bool only_active_experts = params.only_active_experts;
|
||||
if (only_active_experts &&
|
||||
model->arch == LLM_ARCH_QWEN3NEXT &&
|
||||
model->has_tensor_overrides() &&
|
||||
cparams.n_batch >= 512) {
|
||||
// In large-batch hybrid CPU/GPU MoE prompt processing, moving only active experts can
|
||||
// add synchronization and copy overhead. Disable this mode for this Qwen3Next path.
|
||||
LLAMA_LOG_INFO("%s: disabling only_active_experts for Qwen3Next large-batch hybrid MoE prompt path\n", __func__);
|
||||
only_active_experts = false;
|
||||
}
|
||||
if (only_active_experts) {
|
||||
LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
|
||||
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
|
||||
}
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
|
||||
|
||||
Reference in New Issue
Block a user