Graph parallel for Qwen-3.5-MoE (#1347)

* Graph parallel for Qwen-3.5-MoE

* Add --max-gpu to llama-bench

* Fix graph reuse when not all GPUs participate in self-attention
This commit is contained in:
Kawrakow
2026-03-02 07:48:43 +01:00
committed by GitHub
parent 8f9e19d57c
commit d239dabcc6
4 changed files with 58 additions and 18 deletions

View File

@@ -590,6 +590,7 @@ bool llama_context::update_cache_copies() {
GGML_ASSERT(kl->n_device == vl->n_device);
}
for (int id = 0; id < kl->n_device; ++id) {
if (!kl->splits[id]) continue;
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 0];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) return false;
c.cpy->view_offs = kv_self.head*c.step;
@@ -598,6 +599,7 @@ bool llama_context::update_cache_copies() {
}
if (!vl) continue;
for (int id = 0; id < vl->n_device; ++id) {
if (!vl->splits[id]) continue;
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) return false;
c.cpy->view_offs = kv_self.head*c.step;
@@ -1939,6 +1941,7 @@ static bool is_model_split_supported(const llama_model & model) {
LLM_ARCH_STEP35,
LLM_ARCH_QWEN3NEXT,
LLM_ARCH_QWEN35,
LLM_ARCH_QWEN35MOE,
};
auto it = k_supported.find(model.arch);
return it != k_supported.end();