WIP: fast PP with bespoke 4-GPU NCCL

I guess I'm not using NCCL the right way, as PP is very
slow with a single communicator group for 3 or more GPUs.
But if I create 4 communicator groups for pairs of GPUs
(0,1, 2,3, 0,2, 1,3) and use that, PP is fast: I'm hitting
1500 t/s for L3-70B on the 4x3090 system, which is
~20% better than the previous sm graph without NCCL.
But that cannot be the solution (I cannot be creating pairwise
communicators and associated logic for every possible number of GPUs).
This commit is contained in:
Iwan Kawrakow
2025-12-22 07:22:29 +00:00
parent 297f82ed02
commit d50ef0165e
2 changed files with 109 additions and 3 deletions

View File

@@ -250,6 +250,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
#ifdef GGML_USE_NCCL
info.have_nccl = false;
if (info.device_count > 1) {
if (info.device_count == 4) {
int devs[8] = {0,1, 2,3, 0,2, 1,3};
for (int ip = 0; ip < 4; ++ip) {
if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) {
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
GGML_ABORT("Fatal error");
}
}
int gpus[4] = {0, 1, 2, 3};
if (auto status = ncclCommInitAll(info.nccl_coms+8, 4, gpus); status != ncclSuccess) {
printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
info.have_nccl = true;
printf("=============================== NCCL initialized\n");
} else {
int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
@@ -258,6 +274,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
}
}
#endif

View File

@@ -7,6 +7,15 @@
#include "reduce.cuh"
#include <chrono>
// Element-wise accumulation: dst[i] += src[i] for every i in [0, nelem).
// The block_size template parameter is assumed to equal the launch's
// blockDim.x (the global index is derived from it, not from blockDim.x);
// launch with ceil(nelem / block_size) blocks. Out-of-range threads at the
// grid tail do nothing.
template <typename T, int block_size>
static __global__ void k_add(int nelem, const T * src, T * dst) {
    const int idx = threadIdx.x + blockIdx.x*block_size;
    if (idx < nelem) {
        dst[idx] += src[idx];
    }
}
void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
auto op = (ggml_op)dst->op_params[0];
@@ -19,8 +28,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
//printf("============================== %s on device %d with %d sources\n", __func__, ctx.device, nreduce);
#ifdef GGML_USE_NCCL
auto & info = ggml_cuda_info();
#ifdef GGML_USE_NCCL
GGML_ASSERT(info.have_nccl);
GGML_ASSERT(info.device_count == nreduce);
auto type = dst->type;
@@ -28,6 +37,38 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
if (nreduce != info.device_count) {
GGML_ABORT("Not implemented");
}
//auto tim1 = std::chrono::steady_clock::now();
if (nreduce == 4) {
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
if (dst->ne[1] > 32) {
static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
for (int ip = 0; ip < 4; ++ip) {
ncclGroupStart();
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
ncclGroupEnd();
if (status1 != ncclSuccess || status2 != ncclSuccess) {
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
GGML_ABORT("Fatal error");
}
}
} else {
ncclGroupStart();
for (int i = 0; i < nreduce; ++i) {
auto stream = info.all_ctx[i]->stream();
GGML_ASSERT(stream);
auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum,
info.nccl_coms[8+i], stream);
if (status != ncclSuccess) {
fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status);
GGML_ABORT("Fatal error");
}
}
ncclGroupEnd();
}
} else {
ncclGroupStart();
for (int i = 0; i < nreduce; ++i) {
ncclComm_t this_comm;
@@ -57,8 +98,55 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
}
}
ncclGroupEnd();
}
//auto tim2 = std::chrono::steady_clock::now();
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
return;
#endif
fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
GGML_ABORT("Fatal error");
//auto tim1 = std::chrono::steady_clock::now();
//GGML_ASSERT(nhave == nreduce);
//auto nbytes = ggml_nbytes(dst);
//ggml_cuda_pool_alloc<char> buffer(ctx.pool(), nbytes*(nhave-1));
//auto ptr = buffer.ptr;
//for (int i = 0; i < nhave; ++i) {
// GGML_ASSERT(dst->src[i]->type == dst->type);
// GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
// if (i == ctx.device) continue;
// ggml_cuda_set_device(i);
// CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
// if (!info.all_ctx[i]->copy_event) {
// CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
// }
// CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
// ptr += nbytes;
//}
//auto nelem = ggml_nelements(dst);
//int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
//ggml_cuda_set_device(ctx.device);
//ptr = buffer.ptr;
//for (int i = 0; i < nhave; ++i) {
// if (i == ctx.device) continue;
// CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
// if (dst->type == GGML_TYPE_F16) {
// k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->src[nhave-1]->data);
// } else {
// k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const float *)ptr, (float *)dst->src[nhave-1]->data);
// }
// ptr += nbytes;
//}
//if (!ctx.copy_event) {
// CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming));
//}
//CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream()));
//for (int i = 0; i < nhave; ++i) {
// if (i == ctx.device) continue;
// ggml_cuda_set_device(i);
// CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
// CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->src[nhave-1]->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
//}
//ggml_cuda_set_device(ctx.device);
//auto tim2 = std::chrono::steady_clock::now();
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
//fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
//GGML_ABORT("Fatal error");
}