From 70654881350bb0739b637755d7e6fdc522587761 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Tue, 24 Feb 2026 15:22:30 +0100
Subject: [PATCH] Slightly better graph parallel for Qwen3-Next (#1307)

* Make sure we pick the reduced tensor from the right GPU

* Minor
---
 ggml/include/ggml-backend.h |  1 +
 ggml/src/ggml-backend.cpp   | 10 ++++++++++
 src/llama-build-context.cpp | 24 ++++++++++++++++--------
 src/llama.cpp               |  5 +++--
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index e75606dc..4083e734 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -186,6 +186,7 @@ extern "C" {
 
     GGML_API int             ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
     GGML_API ggml_backend_t  ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+    GGML_API int             ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer);
 
     // Get the number of splits of the last graph
     GGML_API int             ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 09f876d0..17f57123 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -2664,6 +2664,16 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
     return sched->backends[i];
 }
 
+// Map a buffer to the index of the scheduler backend whose default buffer
+// type it was allocated from, or -1 if it matches none of them.
+int ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+    if (!buffer || !buffer->buft) return -1;
+    for (int i = 0; i < sched->n_backends; ++i) {
+        if (ggml_backend_get_default_buffer_type(sched->backends[i]) == buffer->buft) return i;
+    }
+    return -1;
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index c9209370..a7f9e785 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1986,9 +1986,12 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
             }
         }
     } else {
-        if (cur->op == GGML_OP_REDUCE && cur->src[lctx.model.main_gpu]) {
+        int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
+        int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
+        if (idx_out >= 0) idx = idx_out;
+        if (cur->op == GGML_OP_REDUCE && cur->src[idx]) {
             // avoid copy to main GPU
-            cur->view_src = cur->src[lctx.model.main_gpu];
+            cur->view_src = cur->src[idx];
         }
         if (output_norm) {
             cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
@@ -4458,12 +4461,18 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
 
 
         if (hparams.is_recurrent(il)) {
-            if (inpL->op == GGML_OP_REDUCE && inpL->src[model.default_layer_device[il]]) {
-                inpL->view_src = inpL->src[model.default_layer_device[il]];
-                //printf("Using reduce result on device %d\n", model.default_layer_device[il]);
-                //inpL = inpL->src[model.default_layer_device[il]];
+            int idx = model.default_layer_device[il];
+            if (inpL->op == GGML_OP_REDUCE) {
+                if (kv_self.s_l[il]) {
+                    // This shouldn't be necessary, but just in case.
+                    int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer);
+                    if (idx_s_l >= 0) idx = idx_s_l;
+                }
+                if (inpL->src[idx]) {
+                    inpL->view_src = inpL->src[idx];
+                }
             }
-            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[model.default_layer_device[il]] : model.layers[il].attn_norm;
+            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
             cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
             cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
@@ -4474,7 +4483,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
             cur = ggml_add(ctx0, cur, inpSA);
             cb(cur, "attn_residual", il);
         } else {
-            //cur = build_layer_attn(cur, inp_pos, KQ_mask, il);
             cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
                     KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, false);
         }
diff --git a/src/llama.cpp b/src/llama.cpp
index 573c042d..71d27dd3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2049,9 +2049,10 @@ static bool llm_load_tensors(
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         }
-        auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
+        //auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
         // assign the repeating layers
         for (int i = i_gpu_start; i < n_layer; ++i) {
+            auto buft_layer = llama_default_buffer_type_offload(model, model.default_layer_device[i]);
             if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
                 int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                         float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
@@ -2065,7 +2066,7 @@ static bool llm_load_tensors(
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(model, model.devices[main_gpu])
+                llama_default_buffer_type_offload(model, model.default_layer_device[n_layer])
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
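
Both call sites of the new helper follow the same pattern: ask the scheduler
which backend owns a tensor's buffer, and fall back to the layer's default
device index when the buffer is unknown to it. A minimal sketch of that
pattern in isolation; pick_device_idx, t, and fallback_idx are illustrative
names, not part of the patch:

    #include "ggml.h"
    #include "ggml-backend.h"

    // Resolve which scheduler backend owns t's buffer.
    // ggml_backend_sched_get_backend_idx() returns -1 when the buffer (or
    // its buffer type) is not known to the scheduler, in which case we keep
    // the index the caller would have used anyway, e.g. the layer's entry
    // in default_layer_device.
    static int pick_device_idx(ggml_backend_sched_t sched, const struct ggml_tensor * t, int fallback_idx) {
        int idx = (t && t->buffer) ? ggml_backend_sched_get_backend_idx(sched, t->buffer) : -1;
        return idx >= 0 ? idx : fallback_idx;
    }

build_output() does this with model.output->buffer and build_qwen3next() with
kv_self.s_l[il]->buffer, then reuses the GGML_OP_REDUCE partial result already
residing on that device by aliasing it through view_src instead of copying it
to the main GPU.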