From 41a8d05420b113a59b29cb3a87d9d3b84c869dc4 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 25 Dec 2025 08:44:24 +0100
Subject: [PATCH 1/2] Reduce add improvemens without NCCL (#1088)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml/src/ggml-cuda/reduce.cu | 149 +++++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 51 deletions(-)
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index dda70252..cf8d59e3 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -45,6 +45,23 @@ static __global__ void k_reduce_add(copy_task task) {
     }
 }
 
+template <typename T, int block_size, int nptr>
+static __global__ void k_reduce_add_T(copy_task task) {
+    int i = blockIdx.x*block_size + threadIdx.x;
+    if (i >= task.nelem) return;
+    auto dst = (T *)task.ptrs[0];
+    #pragma unroll
+    for (int j = 1; j < nptr; ++j) {
+        auto src = (T *)task.ptrs[j];
+        dst[i] += src[i];
+    }
+    #pragma unroll
+    for (int j = 1; j < nptr; ++j) {
+        auto src = (T *)task.ptrs[j];
+        src[i] = dst[i];
+    }
+}
+
 void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     auto op = (ggml_op)dst->op_params[0];
@@ -146,53 +163,6 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
 #endif
     GGML_ASSERT(dst->data == dst->src[ctx.device]->data);
     auto nbytes = ggml_nbytes(dst);
-    if (nhave == 2 && (nhave == nreduce || dst->ne[1] <= 8)) {
-        int idx[2];
-        int ii = 0;
-        for (int i = 0; i < nreduce; ++i) {
-            if (dst->src[i]) {
-                idx[ii++] = i;
-            }
-        }
-        // With P2P access enabled, we can access peer memory so as if it was local.
-        // Hence, we can launch two reduce kernels, one on each device, each kernel
-        // processing half of the data. This very simply approach almost matches NCCL
-        // performance (I see ~1% lower PP and TG performance on my 2x3090 system).
-        for (int i = 0; i < nhave; ++i) {
-            GGML_ASSERT(dst->src[idx[i]]->type == dst->type);
-            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[idx[i]]));
-            ggml_cuda_set_device(idx[i]);
-            if (!info.all_ctx[idx[i]]->copy_event) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[idx[i]]->copy_event, cudaEventDisableTiming));
-            }
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream()));
-        }
-        auto nelem = ggml_nelements(dst);
-        auto nelem_half = (nelem + 1)/2;
-        for (int i = 0; i < nhave; ++i) {
-            ggml_cuda_set_device(idx[i]);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[i]]->stream(), info.all_ctx[idx[(i+1)%2]]->copy_event, 0));
-            auto this_nelem = std::min(nelem_half, nelem - nelem_half);
-            int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
-            if (dst->type == GGML_TYPE_F16) {
-                auto src_ptr = (half *)dst->src[idx[i]]->data + i*nelem_half;
-                auto dst_ptr = (half *)dst->src[idx[(i+1)%2]]->data + i*nelem_half;
-                k_add_sym<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[idx[i]]->stream()>>>(this_nelem, src_ptr, dst_ptr);
-            } else {
-                auto src_ptr = (float *)dst->src[idx[i]]->data + i*nelem_half;
-                auto dst_ptr = (float *)dst->src[idx[(i+1)%2]]->data + i*nelem_half;
-                k_add_sym<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[idx[i]]->stream()>>>(this_nelem, src_ptr, dst_ptr);
-            }
-        }
-        for (int i = 0; i < nhave; ++i) {
-            ggml_cuda_set_device(idx[i]);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[idx[i]]->copy_event, info.all_ctx[idx[i]]->stream()));
-            ggml_cuda_set_device(idx[(i+1)%2]);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[idx[(i+1)%2]]->stream(), info.all_ctx[idx[i]]->copy_event));
-        }
-        ggml_cuda_set_device(ctx.device);
-        return;
-    }
     int idx[GGML_CUDA_MAX_DEVICES];
     {
         int ii = 0;
@@ -230,9 +200,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             task.ptrs[1] = (char *)dst->src[j]->data;
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
             if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
             } else {
-                k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
             }
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
@@ -252,9 +222,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             task.ptrs[1] = (char *)dst->src[j]->data;
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
             if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
             } else {
-                k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
             }
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
@@ -271,6 +241,83 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         ggml_cuda_set_device(ctx.device);
         return;
     }
+    if (dst->ne[1] <= 8) {
+        for (int ii = 0; ii < nhave; ++ii) {
+            int i = idx[ii];
+            GGML_ASSERT(dst->src[i]->type == dst->type);
+            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
+            ggml_cuda_set_device(i);
+            if (!info.all_ctx[i]->copy_event) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+            }
+            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        }
+        //printf("Recorded events\n");
+        auto nelem = ggml_nelements(dst);
+        auto nelem_per_device = (nelem + nhave - 1)/nhave;
+        auto elem_size = ggml_element_size(dst);
+        for (int ii = 0; ii < nhave; ++ii) {
+            int i = idx[ii];
+            int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device);
+            copy_task task;
+            task.nptr = nhave;
+            task.nelem = this_nelem;
+            task.ptrs[0] = (char *)dst->src[i]->data + ii*nelem_per_device*elem_size;
+            int k = 1;
+            for (int jj = 0; jj < nhave; ++jj) {
+                if (jj == ii) continue;
+                int j = idx[jj];
+                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+                task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size;
+            }
+            int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
+            if (dst->type == GGML_TYPE_F16) {
+                switch (nhave) {
+                    case 2:
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    case 3:
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    case 4:
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    default:
+                        k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                }
+            } else {
+                switch (nhave) {
+                    case 2:
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    case 3:
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    case 4:
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        break;
+                    default:
+                        k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                }
+            }
+        }
+        //printf("Submitted kernels\n");
+        for (int ii = 0; ii < nhave; ++ii) {
+            int i = idx[ii];
+            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        }
+        //printf("Recorded events again\n");
+        for (int ii = 0; ii < nhave; ++ii) {
+            int i = idx[ii];
+            for (int jj = 0; jj < nhave; ++jj) {
+                if (jj == ii) continue;
+                int j = idx[jj];
+                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            }
+        }
+        //printf("All good so far\n");
+        return;
+    }
     auto required_size = nbytes*(nhave-1);
     if (required_size > ctx.copy_size) {
         if (ctx.copy_buffer) {

From 03ed5f7096874e292accc957a7005cd7e2416931 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 25 Dec 2025 08:55:08 +0100
Subject: [PATCH 2/2] Fix split mode graph when p2p is not enabled (#1091)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml/src/ggml-cuda.cu         | 8 ++++++--
 ggml/src/ggml-cuda/common.cuh | 1 +
 ggml/src/ggml-cuda/reduce.cu  | 4 ++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 828ea4c2..ad641c85 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1479,9 +1479,10 @@ static void ggml_cuda_op_mul_mat_cublas(
     GGML_UNUSED(src1_padded_row_size);
 }
 
-static void ggml_cuda_set_peer_access(int main_device) {
+static bool ggml_cuda_set_peer_access(int main_device) {
     ggml_cuda_set_device(main_device);
 
+    bool all_enabled = true;
     for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
         if (main_device == id_other) {
             continue;
@@ -1500,8 +1501,11 @@ static void ggml_cuda_set_peer_access(int main_device) {
                 // reset the error
                 (void)cudaGetLastError();
             }
+        } else {
+            all_enabled = false;
         }
     }
+    return all_enabled;
 }
 
 static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
@@ -4453,7 +4457,7 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] con
 
 #if !defined(GGML_CUDA_NO_PEER_COPY)
     if (enable_p2p) {
-        ggml_cuda_set_peer_access(device);
+        ctx->p2p_enabled = ggml_cuda_set_peer_access(device);
     }
 #endif
 
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 27909321..c1034d8d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -843,6 +843,7 @@ struct ggml_backend_cuda_context {
     int device;
     std::string name;
     cudaEvent_t copy_event = nullptr;
+    bool p2p_enabled = false;
 
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index cf8d59e3..7b63bd47 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -176,7 +176,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         GGML_ASSERT(ii == nhave);
         GGML_ASSERT(have_this_device);
     }
-    if (nhave == 4 && dst->ne[1] <= 8) {
+    if (nhave == 4 && dst->ne[1] <= 8 && ctx.p2p_enabled) {
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             GGML_ASSERT(dst->src[i]->type == dst->type);
@@ -241,7 +241,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         ggml_cuda_set_device(ctx.device);
         return;
     }
-    if (dst->ne[1] <= 8) {
+    if (dst->ne[1] <= 8 && ctx.p2p_enabled) {
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             GGML_ASSERT(dst->src[i]->type == dst->type);