Fix data races in the reduce op (#1124)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -263,6 +263,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                 ggml_cuda_set_device(info.all_ctx[peer]->device);
+                if (stage == 0) {
+                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                }
                 CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[i]->device,
                                                (const char *)dst->src[peer]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[peer]->device,
                                                this_nelem*elem_size, info.all_ctx[peer]->stream()));
@@ -275,7 +278,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         ggml_cuda_set_device(ctx.device);
         return;
     }
-    if (nhave == 4 && dst->ne[1] <= 8 && ctx.p2p_enabled) {
+    if (false && nhave == 4 && dst->ne[1] <= 8 && ctx.p2p_enabled) {
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             GGML_ASSERT(dst->src[i]->type == dst->type);
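
The race the first hunk closes is the classic one between a compute kernel on one GPU and an asynchronous peer-to-peer copy issued on another GPU's stream: without an explicit ordering point, the copy can touch the buffer while the producing kernel is still running. Below is a minimal, self-contained sketch of that event-based ordering pattern using only standard CUDA runtime calls; the device indices, buffer names and the dummy kernel are illustrative assumptions and are not taken from ik_llama.cpp.

// Sketch of the cross-stream ordering pattern the fix relies on (illustrative
// names, not the actual ik_llama.cpp code): the stream that issues the peer
// copy waits on an event recorded after the producer's compute work, so the
// copy cannot race with the kernel that is still writing the buffer.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call) do { cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { fprintf(stderr, "CUDA error %s at %s:%d\n", \
        cudaGetErrorString(err_), __FILE__, __LINE__); exit(1); } } while (0)

__global__ void fill(float * x, int n, float v) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] = v;
}

int main() {
    const int n = 1 << 20;
    // Device 0 produces data, device 1 pulls it with a peer copy on its own stream.
    float *buf0 = nullptr, *buf1 = nullptr;
    cudaStream_t s0, s1;
    cudaEvent_t done0;

    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));
    CUDA_CHECK(cudaStreamCreate(&s0));
    CUDA_CHECK(cudaEventCreateWithFlags(&done0, cudaEventDisableTiming));
    CUDA_CHECK(cudaMalloc(&buf0, n*sizeof(float)));

    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaDeviceEnablePeerAccess(0, 0));
    CUDA_CHECK(cudaStreamCreate(&s1));
    CUDA_CHECK(cudaMalloc(&buf1, n*sizeof(float)));

    // Producer: kernel on device 0's stream, then record an event on that same stream.
    CUDA_CHECK(cudaSetDevice(0));
    fill<<<(n + 255)/256, 256, 0, s0>>>(buf0, n, 1.0f);
    CUDA_CHECK(cudaEventRecord(done0, s0));

    // Consumer: device 1's stream waits on the event before the peer copy is issued,
    // so the copy is ordered after the producer kernel even though it lives on
    // a different device and stream.
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaStreamWaitEvent(s1, done0, 0));
    CUDA_CHECK(cudaMemcpyPeerAsync(buf1, 1, buf0, 0, n*sizeof(float), s1));
    CUDA_CHECK(cudaStreamSynchronize(s1));

    printf("peer copy completed after producer kernel\n");
    return 0;
}

Without the cudaStreamWaitEvent call the copy would be ordered only within s1 and could overlap the producer kernel on s0; the added wait on compute_event in the hunk above plays the same role for the cudaMemcpyPeerAsync in the reduce op.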