Avoid ggml_get_rows if not necessary (#1160)

* Copy reduce result to other GPUs if necessary

* Avoid ggml_get_rows for TG

* For the output ops use the result of the split that ran on the main GPU

* More models
This commit is contained in:
Kawrakow
2026-01-20 15:38:21 +02:00
committed by GitHub
parent 132a01d25d
commit 996e77047a
5 changed files with 132 additions and 109 deletions

View File

@@ -2244,7 +2244,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
-if (split->graph.nodes[0]->op == GGML_OP_REDUCE) {
+if (split->graph.nodes[0]->op == GGML_OP_REDUCE && i < sched->n_splits - 1) {
last_reduce = split_backend_id;
if (ith == split_backend_id) {
auto node = split->graph.nodes[0];
@@ -2318,7 +2318,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
-if (split->graph.nodes[0]->op == GGML_OP_REDUCE) {
+if (split->graph.nodes[0]->op == GGML_OP_REDUCE && i < sched->n_splits - 1) {
last_reduce = split_backend_id;
barrier.arrive_and_wait();
if (ith == split_backend_id) {