From d6e5fb00d60a28a2521be6ba11c7b36719e47d4d Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Thu, 15 Jan 2026 13:29:27 +0000 Subject: [PATCH] WIP: this seems more stable Still hanging after a while if I try to use all 7 GPUs --- ggml/src/ggml-backend.cpp | 35 ++++++++++++++++++++++++++--------- ggml/src/ggml-cuda/reduce.cu | 8 +++++++- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index a2ffc867..985a24e5 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -2180,7 +2180,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } - if (!has_cpu_work) { + if (false && !has_cpu_work) { #pragma omp parallel num_threads(sched->n_backends) { @@ -2216,8 +2216,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (ith == split_backend_id) { - // copy the input tensors to the split backend - //ggml_backend_sched_copy_inputs(sched, split, sched->needs_sync, ids, unique_ids, last_ids_tensor); if (split->n_inputs > 0 && !sched->own_cpy[split_backend_id]) { sched->needs_sync[split_backend_id] = true; @@ -2248,8 +2246,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } #endif if (!work_done) { - std::barrier barrier(sched->n_backends, [] () noexcept {}); - auto compute = [sched, &barrier] (int ith) { + int first_reduce = -1; + for (int i = 0; i < sched->n_splits; i++) { + auto split = &sched->splits[i]; + if (split->graph.n_nodes == 1 && split->graph.nodes[0]->op == GGML_OP_REDUCE) { + first_reduce = split->backend_id; + break; + } + } + + std::barrier barrier(sched->n_backends); + auto compute = [sched, &barrier, first_reduce] (int ith) { struct ggml_backend_sched_split * splits = sched->splits; @@ -2257,6 +2264,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector unique_ids; ggml_tensor * last_ids_tensor = nullptr; + int last_reduce = first_reduce; + for (int i = 0; i < sched->n_splits; i++) { #if IK_PRINT_TIMING int64_t tim1 = ggml_time_us(); @@ -2271,10 +2280,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s barrier.arrive_and_wait(); } - if (ith == split_backend_id) { - // copy the input tensors to the split backend - ggml_backend_sched_copy_inputs(sched, split, sched->needs_sync, ids, unique_ids, last_ids_tensor); + if (split->n_inputs > 0) { + int copy_thread = last_reduce >= 0 ? last_reduce : 0; + if (ith == copy_thread) { + ggml_backend_sched_copy_inputs(sched, split, sched->needs_sync, ids, unique_ids, last_ids_tensor); + } + barrier.arrive_and_wait(); + } + if (ith == split_backend_id) { + + sched->statuses[ith] = ggml_backend_sched_eval(sched, split_backend, split); if (split->n_inputs > 0 && !sched->own_cpy[split_backend_id]) { sched->needs_sync[split_backend_id] = true; } else { @@ -2284,10 +2300,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } - sched->statuses[ith] = ggml_backend_sched_eval(sched, split_backend, split); } if (split->graph.nodes[0]->op == GGML_OP_REDUCE) { + last_reduce = split_backend_id; barrier.arrive_and_wait(); } //if (needs_barrier) { @@ -2297,6 +2313,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // record the event of this copy if (split->n_inputs > 0) { if (sched->events[split_backend_id][sched->cur_copy] != NULL) { + printf("Recording event %d, %d\n", split_backend_id, sched->cur_copy); ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]); } } diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu index 84df8c31..6898c5fd 100644 --- a/ggml/src/ggml-cuda/reduce.cu +++ b/ggml/src/ggml-cuda/reduce.cu @@ -350,7 +350,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ ggml_cuda_set_device(ctx.device); return; } - if (dst->ne[1] <= 8 && ctx.p2p_enabled) { + if (dst->ne[1] < 32 && ctx.p2p_enabled) { for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; GGML_ASSERT(dst->src[i]->type == dst->type); @@ -479,6 +479,12 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ ggml_cuda_set_device(i); CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0)); CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); } ggml_cuda_set_device(ctx.device); + for (int ii = 0; ii < nhave; ++ii) { + int i = idx[ii]; + if (i == ctx.device) continue; + CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0)); + } }