qwen3next: split cpu/cuda eval builds and tune PP scheduling

This commit is contained in:
yurko
2026-02-06 19:28:17 -08:00
parent e64b43392f
commit 6db8dc86ca
3 changed files with 146 additions and 20 deletions

View File

@@ -4984,8 +4984,18 @@ struct llama_context * llama_new_context_with_model(
}
}
if (params.only_active_experts) {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
bool only_active_experts = params.only_active_experts;
if (only_active_experts &&
model->arch == LLM_ARCH_QWEN3NEXT &&
model->has_tensor_overrides() &&
cparams.n_batch >= 512) {
// In large-batch hybrid CPU/GPU MoE prompt processing, moving only active experts can
// add synchronization and copy overhead. Disable this mode for this Qwen3Next path.
LLAMA_LOG_INFO("%s: disabling only_active_experts for Qwen3Next large-batch hybrid MoE prompt path\n", __func__);
only_active_experts = false;
}
if (only_active_experts) {
LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
}
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {