Graph parallel for Qwen-3.5-MoE (#1347)

* Graph parallel for Qwen-3.5-MoE

* Add --max-gpu to llama-bench

* Fix graph reuse when not all GPUs participate in self-attention
This commit is contained in:
Kawrakow
2026-03-02 07:48:43 +01:00
committed by GitHub
parent 8f9e19d57c
commit d239dabcc6
4 changed files with 58 additions and 18 deletions

View File

@@ -590,6 +590,7 @@ bool llama_context::update_cache_copies() {
GGML_ASSERT(kl->n_device == vl->n_device);
}
for (int id = 0; id < kl->n_device; ++id) {
if (!kl->splits[id]) continue;
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 0];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) return false;
c.cpy->view_offs = kv_self.head*c.step;
@@ -598,6 +599,7 @@ bool llama_context::update_cache_copies() {
}
if (!vl) continue;
for (int id = 0; id < vl->n_device; ++id) {
if (!vl->splits[id]) continue;
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) return false;
c.cpy->view_offs = kv_self.head*c.step;
@@ -1939,6 +1941,7 @@ static bool is_model_split_supported(const llama_model & model) {
LLM_ARCH_STEP35,
LLM_ARCH_QWEN3NEXT,
LLM_ARCH_QWEN35,
LLM_ARCH_QWEN35MOE,
};
auto it = k_supported.find(model.arch);
return it != k_supported.end();