Playing games with the scheduler
This change tricks it into doing the right thing™. It is still quite a bit slower than split mode "layer" for the 8B LLaMA model, but for the 70B LLaMA it now beats split mode "layer" for token generation (TG): 28 t/s vs 24.4 t/s. Prompt processing (PP) is 627 t/s vs 744 t/s. For comparison, split mode "row" in mainline gets 484 t/s PP and 19.3 t/s TG.
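Mechanically, the diff below tags the ggml_add that combines the per-GPU partial results of the split FFN and attention computations by setting op_params[0] = 0xff, and makes ggml_backend_sched_split_graph() start a new split whenever it meets such a tagged ADD node. Below is a minimal standalone sketch of that tagging idea; the enum, struct, and function names are simplified stand-ins for illustration, not the real ggml types or API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for ggml_op / ggml_tensor; the real types live in ggml.h. */
enum fake_op { FAKE_OP_NONE, FAKE_OP_ADD };

struct fake_tensor {
    enum fake_op op;
    int32_t op_params[8];   /* ggml tensors carry a small per-op parameter area */
};

/* Graph-build side: after the ggml_add that sums the per-GPU partial results,
 * mark the node so the scheduler knows this is a natural split boundary. */
static void tag_combine_node(struct fake_tensor * t) {
    t->op_params[0] = 0xff;
}

/* Scheduler side: a tagged ADD forces a new split even when the usual
 * "same backend and the split already has inputs" heuristic would not. */
static bool needs_new_split(const struct fake_tensor * t) {
    return t->op == FAKE_OP_ADD && t->op_params[0] == 0xff;
}

int main(void) {
    struct fake_tensor combine = { .op = FAKE_OP_ADD, .op_params = {0} };
    tag_combine_node(&combine);
    printf("start new split here: %s\n", needs_new_split(&combine) ? "yes" : "no");
    return 0;
}

Reusing op_params of an existing op avoids introducing a new ggml op just to carry a scheduling hint, which appears to be why the patch piggybacks the marker on GGML_OP_ADD.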
@@ -1632,7 +1632,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
-        if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+        if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
+            need_new_split = true;
+        }
+        else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * src = node->src[j];
                 if (src == NULL) {
                     continue;
@@ -680,6 +680,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             cur = ggml_add(ctx, cur, ffn[id]);
             cb(cur, "combine_ffn", il);
         }
+        cur->op_params[0] = 0xff;
         return cur;
     }
@@ -9088,6 +9089,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             cur = ggml_add(ctx0, cur, attn[id]);
             cb(cur, "combine_attn", il);
         }
+        cur->op_params[0] = 0xff;
         return cur;
     }
 }