qwen3next: split cpu/cuda eval builds and tune PP scheduling

2026-03-07 20:40:02 +00:00 · 2026-02-06 19:28:17 -08:00
parent e64b43392f
commit 6db8dc86ca
3 changed files with 146 additions and 20 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4984,8 +4984,18 @@ struct llama_context * llama_new_context_with_model(
        }
    }

-    if (params.only_active_experts) {
-        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+    bool only_active_experts = params.only_active_experts;
+    if (only_active_experts &&
+        model->arch == LLM_ARCH_QWEN3NEXT &&
+        model->has_tensor_overrides() &&
+        cparams.n_batch >= 512) {
+        // In large-batch hybrid CPU/GPU MoE prompt processing, moving only active experts can
+        // add synchronization and copy overhead. Disable this mode for this Qwen3Next path.
+        LLAMA_LOG_INFO("%s: disabling only_active_experts for Qwen3Next large-batch hybrid MoE prompt path\n", __func__);
+        only_active_experts = false;
+    }
+    if (only_active_experts) {
+        LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
    }
    if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {