Fix ring reduction (#1114)

* Fix ring reduction

* Actually enable it

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow authored on 2026-01-07 08:01:31 +02:00, committed via GitHub
parent 54a513768c
commit a82dcbf3ee
2 changed files with 26 additions and 10 deletions


@@ -843,6 +843,7 @@ struct ggml_backend_cuda_context {
     int device;
     std::string name;
     cudaEvent_t copy_event = nullptr;
+    cudaEvent_t compute_event = nullptr;
     bool p2p_enabled = false;
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
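For reference, compute_event follows the same pattern as the existing copy_event: a timing-disabled CUDA event whose only job is to order work between streams. The sketch below is a minimal, self-contained illustration of that pattern, not code from this repository; the stream and buffer names are invented.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

    int main() {
        cudaStream_t producer, consumer;
        cudaEvent_t  done;                                               // plays the role of copy_event / compute_event
        CHECK(cudaStreamCreate(&producer));
        CHECK(cudaStreamCreate(&consumer));
        CHECK(cudaEventCreateWithFlags(&done, cudaEventDisableTiming));  // sync-only event, no timestamps recorded

        float * buf;
        CHECK(cudaMalloc(&buf, 1024 * sizeof(float)));
        CHECK(cudaMemsetAsync(buf, 0, 1024 * sizeof(float), producer));  // work submitted to the producer stream
        CHECK(cudaEventRecord(done, producer));                          // mark the point the consumer must reach
        CHECK(cudaStreamWaitEvent(consumer, done, 0));                   // consumer stream waits, host does not block
        CHECK(cudaMemsetAsync(buf, 1, 1024 * sizeof(float), consumer));  // safe: ordered after the producer's write

        CHECK(cudaStreamSynchronize(consumer));
        CHECK(cudaFree(buf));
        CHECK(cudaEventDestroy(done));
        CHECK(cudaStreamDestroy(producer));
        CHECK(cudaStreamDestroy(consumer));
        return 0;
    }

cudaEventDisableTiming keeps the event as lightweight as possible, which matters here because an event is recorded per device on every reduction stage.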


@@ -196,7 +196,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // i = 0, peer = 1, ichunk = 1 -> copy part 1 from device 1, device 0 now has parts 0, 1, 2, 3
     // etc.
     //
-    if (false && dst->ne[1] >= 32) {
+    if (dst->ne[1] >= 32) {
         auto nelem = ggml_nelements(dst);
         auto elem_size = ggml_element_size(dst);
         auto nelem_per_device = (nelem + nhave - 1)/nhave;
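The comment above describes how chunks rotate around the ring of devices, each device pulling one chunk per stage from its neighbour and accumulating it. As a hedged illustration of the underlying idea, here is a small host-only C++ simulation of the textbook ring reduce-scatter schedule; the names (n_dev for the number of devices, i.e. nhave, and chunk for the chunk length) are invented and the chunk indexing is not claimed to match this kernel exactly:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // n_dev "devices", each holding a full copy of the tensor split into n_dev chunks.
    // At every stage each device pulls one chunk from its ring neighbour and accumulates it,
    // so after n_dev-1 stages every chunk is fully reduced on exactly one device.
    int main() {
        const int n_dev = 4, chunk = 3, n = n_dev * chunk;
        std::vector<std::vector<float>> data(n_dev, std::vector<float>(n));
        for (int d = 0; d < n_dev; ++d)
            for (int j = 0; j < n; ++j) data[d][j] = float(d + 1);      // device d contributes d+1 everywhere

        for (int stage = 0; stage < n_dev - 1; ++stage) {
            for (int dst = 0; dst < n_dev; ++dst) {
                int src    = (dst - 1 + n_dev) % n_dev;                 // ring neighbour we pull from
                int ichunk = (src - stage + n_dev) % n_dev;             // chunk currently travelling through src
                for (int j = 0; j < chunk; ++j)
                    data[dst][ichunk*chunk + j] += data[src][ichunk*chunk + j];
            }
        }

        const float expected = 1 + 2 + 3 + 4;                           // sum of all contributions
        for (int c = 0; c < n_dev; ++c) {
            int owner = (c + n_dev - 1) % n_dev;                        // device holding the fully reduced chunk c
            for (int j = 0; j < chunk; ++j)
                assert(data[owner][c*chunk + j] == expected);
        }
        printf("reduce-scatter ok: every chunk fully reduced on one device\n");
        return 0;
    }

A full all-reduce would normally follow this reduce-scatter with an all-gather pass so that every device ends up with every reduced chunk.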
@@ -204,17 +204,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             auto this_ctx = info.all_ctx[i];
-            if (!this_ctx->copy_event) {
+            if (!this_ctx->copy_event || !this_ctx->compute_event || required_size > this_ctx->copy_size) {
                 ggml_cuda_set_device(this_ctx->device);
-                CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->copy_event, cudaEventDisableTiming));
-            }
-            if (required_size > this_ctx->copy_size) {
-                ggml_cuda_set_device(this_ctx->device);
-                if (this_ctx->copy_buffer) {
-                    CUDA_CHECK(cudaFree(this_ctx->copy_buffer));
+                if (!this_ctx->copy_event) {
+                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->copy_event, cudaEventDisableTiming));
+                }
+                if (!this_ctx->compute_event) {
+                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->compute_event, cudaEventDisableTiming));
+                }
+                if (required_size > this_ctx->copy_size) {
+                    if (this_ctx->copy_buffer) {
+                        CUDA_CHECK(cudaFree(this_ctx->copy_buffer));
+                    }
+                    CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, required_size, this_ctx->device));
+                    this_ctx->copy_size = required_size;
                 }
-                CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, required_size, this_ctx->device));
-                this_ctx->copy_size = required_size;
             }
         }
         for (int stage = 0; stage < nhave-1; ++stage) {
@@ -224,10 +228,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                 ggml_cuda_set_device(info.all_ctx[peer]->device);
+                if (stage > 0) {
+                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                }
                 CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
                             (const char *)dst->src[peer]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[peer]->device,
                             this_nelem*elem_size, info.all_ctx[peer]->stream()));
                 CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
             ichunk = stage;
+            for (int ii = 0; ii < nhave; ++ii) {
+                int i = idx[ii];
+                int peer = idx[(ii+1)%nhave];
+                auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
+                ggml_cuda_set_device(info.all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
@@ -238,6 +252,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                     k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
                             (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
                 }
+                CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
         }
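Taken together, the changes close a reuse race: copy_buffer on device i is refilled by its peer on every stage, so the peer must not start the next stage's cudaMemcpyPeerAsync until device i's k_add from the previous stage has finished reading the buffer. Recording compute_event after the add and waiting on it for stage > 0 enforces exactly that ordering. The following single-device, two-stream sketch mirrors the same handshake; it is an illustration of the pattern, not code from the repository, and the names staging, acc and src are invented (staging stands in for copy_buffer):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

    // Each thread adds one element of src into dst.
    __global__ void add_into(int n, const float * src, float * dst) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dst[i] += src[i];
    }

    int main() {
        const int n = 1 << 20;
        cudaStream_t copy_stream, compute_stream;
        cudaEvent_t  copy_event, compute_event;
        CHECK(cudaStreamCreate(&copy_stream));
        CHECK(cudaStreamCreate(&compute_stream));
        CHECK(cudaEventCreateWithFlags(&copy_event,    cudaEventDisableTiming));
        CHECK(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming));

        float *staging, *acc, *src;
        CHECK(cudaMalloc(&staging, n * sizeof(float)));                  // plays the role of copy_buffer
        CHECK(cudaMalloc(&acc,     n * sizeof(float)));
        CHECK(cudaMalloc(&src,     n * sizeof(float)));
        CHECK(cudaMemset(acc, 0, n * sizeof(float)));
        CHECK(cudaMemset(src, 0, n * sizeof(float)));

        const int n_stages = 3;
        for (int stage = 0; stage < n_stages; ++stage) {
            // Producer: refill the staging buffer, but only once the previous consumer is done with it.
            if (stage > 0) {
                CHECK(cudaStreamWaitEvent(copy_stream, compute_event, 0));
            }
            CHECK(cudaMemcpyAsync(staging, src, n * sizeof(float), cudaMemcpyDeviceToDevice, copy_stream));
            CHECK(cudaEventRecord(copy_event, copy_stream));

            // Consumer: wait for the copy, accumulate, then mark the staging buffer as reusable.
            CHECK(cudaStreamWaitEvent(compute_stream, copy_event, 0));
            add_into<<<(n + 255)/256, 256, 0, compute_stream>>>(n, staging, acc);
            CHECK(cudaEventRecord(compute_event, compute_stream));
        }
        CHECK(cudaStreamSynchronize(compute_stream));
        printf("staging buffer reused %d times without a race\n", n_stages);

        CHECK(cudaFree(staging)); CHECK(cudaFree(acc)); CHECK(cudaFree(src));
        return 0;
    }

Without the wait on compute_event, the next iteration's copy could overwrite staging while the previous add kernel is still reading it, which is exactly the kind of intermittent corruption a missing inter-stream dependency produces.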