From 52a7cbe482a245c2bb86e2f71bc53315d7040b86 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 26 Nov 2025 20:34:37 +0000 Subject: [PATCH] Playing games with the scheduler This change tricks it into doing the right thing^TM. Still quite a bit slower than split mode layer for the 8B LLaMA model. But for the 70B LLaMA it now beats split mode layer for TG: 28 t/s vs 24.4 t/s. PP is 627 t/s vs 744 t/s. In comparison, split mode "row" in mainline gets 484 t/s PP and 19.3 t/s TG. --- ggml/src/ggml-backend.cpp | 5 ++++- src/llama-build-context.cpp | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index ec56a7b0..f7c1593b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1632,7 +1632,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // check if we should start a new split based on the sources of the current node bool need_new_split = false; - if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) { + need_new_split = true; + } + else if (node_backend_id == cur_backend_id && split->n_inputs > 0) { for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 2711dd04..2b2dc8d8 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -680,6 +680,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( cur = ggml_add(ctx, cur, ffn[id]); cb(cur, "combine_ffn", il); } + cur->op_params[0] = 0xff; return cur; } @@ -9088,6 +9089,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens cur = ggml_add(ctx0, cur, attn[id]); cb(cur, "combine_attn", il); } + cur->op_params[0] = 0xff; return cur; } }