From d50ef0165e8a983a98111ac2f42e65cc9c8c32de Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 22 Dec 2025 07:22:29 +0000 Subject: [PATCH] WIP: fast PP with bespoke 4-GPU NCCL I guess, I'm not using NCCL the right way as PP is very low with a single communicator group for 3 or more GPUs. But if I create 4 communicator groups for pairs of GPUs (0,1, 2,3, 0,2, 1,3) and use that, PP is fast: I'm hitting 1500 t/s for L3-70B on the 4x3090 system, which is ~20% better than the previous sm graph without NCCL. But that cannot be the solution (I cannot be creating pairwise communicators and associated logic for every possible number of GPUs). --- ggml/src/ggml-cuda.cu | 18 +++++++ ggml/src/ggml-cuda/reduce.cu | 94 ++++++++++++++++++++++++++++++++++-- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 86000052..96103c1f 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -250,6 +250,22 @@ static ggml_cuda_device_info ggml_cuda_init() { #ifdef GGML_USE_NCCL info.have_nccl = false; if (info.device_count > 1) { + if (info.device_count == 4) { + int devs[8] = {0,1, 2,3, 0,2, 1,3}; + for (int ip = 0; ip < 4; ++ip) { + if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) { + printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status)); + GGML_ABORT("Fatal error"); + } + } + int gpus[4] = {0, 1, 2, 3}; + if (auto status = ncclCommInitAll(info.nccl_coms+8, 4, gpus); status != ncclSuccess) { + printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status)); + GGML_ABORT("Fatal error"); + } + info.have_nccl = true; + printf("=============================== NCCL initialized\n"); + } else { int gpu_list[GGML_CUDA_MAX_DEVICES]; for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i; auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list); @@ -258,6 +274,8 @@ static ggml_cuda_device_info ggml_cuda_init() { info.have_nccl = true; } else { printf("=============================== NCCL initialization failed with status %d\n", int(status)); + GGML_ABORT("Fatal error"); + } } } #endif diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu index 360ce80a..371d2a6d 100644 --- a/ggml/src/ggml-cuda/reduce.cu +++ b/ggml/src/ggml-cuda/reduce.cu @@ -7,6 +7,15 @@ #include "reduce.cuh" +#include + +template +static __global__ void k_add(int nelem, const T * src, T * dst) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= nelem) return; + dst[i] += src[i]; +} + void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) { auto op = (ggml_op)dst->op_params[0]; @@ -19,8 +28,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ //printf("============================== %s on device %d with %d sources\n", __func__, ctx.device, nreduce); -#ifdef GGML_USE_NCCL auto & info = ggml_cuda_info(); +#ifdef GGML_USE_NCCL GGML_ASSERT(info.have_nccl); GGML_ASSERT(info.device_count == nreduce); auto type = dst->type; @@ -28,6 +37,38 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ if (nreduce != info.device_count) { GGML_ABORT("Not implemented"); } + //auto tim1 = std::chrono::steady_clock::now(); + if (nreduce == 4) { + auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf; + if (dst->ne[1] > 32) { + static const int devs[8] = {0,1, 2,3, 0,2, 1,3}; + for (int ip = 0; ip < 4; ++ip) { + ncclGroupStart(); + auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data, + ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream()); + auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data, + ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream()); + ncclGroupEnd(); + if (status1 != ncclSuccess || status2 != ncclSuccess) { + fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2); + GGML_ABORT("Fatal error"); + } + } + } else { + ncclGroupStart(); + for (int i = 0; i < nreduce; ++i) { + auto stream = info.all_ctx[i]->stream(); + GGML_ASSERT(stream); + auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum, + info.nccl_coms[8+i], stream); + if (status != ncclSuccess) { + fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status); + GGML_ABORT("Fatal error"); + } + } + ncclGroupEnd(); + } + } else { ncclGroupStart(); for (int i = 0; i < nreduce; ++i) { ncclComm_t this_comm; @@ -57,8 +98,55 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ } } ncclGroupEnd(); + } + //auto tim2 = std::chrono::steady_clock::now(); + //printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast(tim2-tim1).count()); return; #endif - fprintf(stderr, "%s: not implemented without NCCL\n", __func__); - GGML_ABORT("Fatal error"); + //auto tim1 = std::chrono::steady_clock::now(); + //GGML_ASSERT(nhave == nreduce); + //auto nbytes = ggml_nbytes(dst); + //ggml_cuda_pool_alloc buffer(ctx.pool(), nbytes*(nhave-1)); + //auto ptr = buffer.ptr; + //for (int i = 0; i < nhave; ++i) { + // GGML_ASSERT(dst->src[i]->type == dst->type); + // GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); + // if (i == ctx.device) continue; + // ggml_cuda_set_device(i); + // CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream())); + // if (!info.all_ctx[i]->copy_event) { + // CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + // } + // CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + // ptr += nbytes; + //} + //auto nelem = ggml_nelements(dst); + //int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; + //ggml_cuda_set_device(ctx.device); + //ptr = buffer.ptr; + //for (int i = 0; i < nhave; ++i) { + // if (i == ctx.device) continue; + // CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0)); + // if (dst->type == GGML_TYPE_F16) { + // k_add<<>>(nelem, (const half *)ptr, (half *)dst->src[nhave-1]->data); + // } else { + // k_add<<>>(nelem, (const float *)ptr, (float *)dst->src[nhave-1]->data); + // } + // ptr += nbytes; + //} + //if (!ctx.copy_event) { + // CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming)); + //} + //CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream())); + //for (int i = 0; i < nhave; ++i) { + // if (i == ctx.device) continue; + // ggml_cuda_set_device(i); + // CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0)); + // CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->src[nhave-1]->data, ctx.device, nbytes, info.all_ctx[i]->stream())); + //} + //ggml_cuda_set_device(ctx.device); + //auto tim2 = std::chrono::steady_clock::now(); + //printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast(tim2-tim1).count()); + //fprintf(stderr, "%s: not implemented without NCCL\n", __func__); + //GGML_ABORT("Fatal error"); }