mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-08 04:50:13 +00:00
Graph parallel for Qwen-3.5-MoE (#1347)
* Graph parallel for Qwen3.5-MoE
* Add --max-gpu to llama-bench
* Fix graph reuse when not all GPUs participate in self-attention
This commit is contained in:
@@ -590,6 +590,7 @@ bool llama_context::update_cache_copies() {
|
||||
GGML_ASSERT(kl->n_device == vl->n_device);
|
||||
}
|
||||
for (int id = 0; id < kl->n_device; ++id) {
|
||||
if (!kl->splits[id]) continue;
|
||||
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 0];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
@@ -598,6 +599,7 @@ bool llama_context::update_cache_copies() {
|
||||
}
|
||||
if (!vl) continue;
|
||||
for (int id = 0; id < vl->n_device; ++id) {
|
||||
if (!vl->splits[id]) continue;
|
||||
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1];
|
||||
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) return false;
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
@@ -1939,6 +1941,7 @@ static bool is_model_split_supported(const llama_model & model) {
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_QWEN3NEXT,
|
||||
LLM_ARCH_QWEN35,
|
||||
LLM_ARCH_QWEN35MOE,
|
||||
};
|
||||
auto it = k_supported.find(model.arch);
|
||||
return it != k_supported.end();
|
||||
|
||||
Reference in New Issue
Block a user