WIP: fast PP with bespoke 4-GPU NCCL

I guess I'm not using NCCL the right way, as PP is very
slow with a single communicator group for 3 or more GPUs.
But if I create 4 communicator groups for pairs of GPUs
(0,1, 2,3, 0,2, 1,3) and use that, PP is fast: I'm hitting
1500 t/s for L3-70B on the 4x3090 system, which is
~20% better than the previous sm graph without NCCL.
But that cannot be the solution (I cannot be creating pairwise
communicators and associated logic for every possible number of GPUs).
This commit is contained in:
Iwan Kawrakow
2025-12-22 07:22:29 +00:00
parent 297f82ed02
commit d50ef0165e
2 changed files with 109 additions and 3 deletions

View File

@@ -250,6 +250,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
#ifdef GGML_USE_NCCL
info.have_nccl = false;
if (info.device_count > 1) {
if (info.device_count == 4) {
int devs[8] = {0,1, 2,3, 0,2, 1,3};
for (int ip = 0; ip < 4; ++ip) {
if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) {
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
GGML_ABORT("Fatal error");
}
}
int gpus[4] = {0, 1, 2, 3};
if (auto status = ncclCommInitAll(info.nccl_coms+8, 4, gpus); status != ncclSuccess) {
printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
info.have_nccl = true;
printf("=============================== NCCL initialized\n");
} else {
int gpu_list[GGML_CUDA_MAX_DEVICES];
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
@@ -258,6 +274,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.have_nccl = true;
} else {
printf("=============================== NCCL initialization failed with status %d\n", int(status));
GGML_ABORT("Fatal error");
}
}
}
#endif

View File

@@ -7,6 +7,15 @@
#include "reduce.cuh"
#include <chrono>
// Element-wise accumulation: dst[i] += src[i] for every i in [0, nelem).
// The block_size template parameter is assumed to equal the launch's
// blockDim.x (the global index is derived from it, not from blockDim.x);
// launch with ceil(nelem / block_size) blocks. Out-of-range threads at the
// grid tail do nothing.
template <typename T, int block_size>
static __global__ void k_add(int nelem, const T * src, T * dst) {
    const int idx = threadIdx.x + blockIdx.x*block_size;
    if (idx < nelem) {
        dst[idx] += src[idx];
    }
}
void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
auto op = (ggml_op)dst->op_params[0];
@@ -19,8 +28,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
//printf("============================== %s on device %d with %d sources\n", __func__, ctx.device, nreduce);
#ifdef GGML_USE_NCCL
auto & info = ggml_cuda_info();
#ifdef GGML_USE_NCCL
GGML_ASSERT(info.have_nccl);
GGML_ASSERT(info.device_count == nreduce);
auto type = dst->type;
@@ -28,6 +37,38 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
if (nreduce != info.device_count) {
GGML_ABORT("Not implemented");
}
//auto tim1 = std::chrono::steady_clock::now();
if (nreduce == 4) {
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
if (dst->ne[1] > 32) {
static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
for (int ip = 0; ip < 4; ++ip) {
ncclGroupStart();
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
ncclGroupEnd();
if (status1 != ncclSuccess || status2 != ncclSuccess) {
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
GGML_ABORT("Fatal error");
}
}
} else {
ncclGroupStart();
for (int i = 0; i < nreduce; ++i) {
auto stream = info.all_ctx[i]->stream();
GGML_ASSERT(stream);
auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum,
info.nccl_coms[8+i], stream);
if (status != ncclSuccess) {
fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status);
GGML_ABORT("Fatal error");
}
}
ncclGroupEnd();
}
} else {
ncclGroupStart();
for (int i = 0; i < nreduce; ++i) {
ncclComm_t this_comm;
@@ -57,8 +98,55 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
}
}
ncclGroupEnd();
}
//auto tim2 = std::chrono::steady_clock::now();
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
return;
#endif
fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
GGML_ABORT("Fatal error");
//auto tim1 = std::chrono::steady_clock::now();
//GGML_ASSERT(nhave == nreduce);
//auto nbytes = ggml_nbytes(dst);
//ggml_cuda_pool_alloc<char> buffer(ctx.pool(), nbytes*(nhave-1));
//auto ptr = buffer.ptr;
//for (int i = 0; i < nhave; ++i) {
// GGML_ASSERT(dst->src[i]->type == dst->type);
// GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
// if (i == ctx.device) continue;
// ggml_cuda_set_device(i);
// CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
// if (!info.all_ctx[i]->copy_event) {
// CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
// }
// CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
// ptr += nbytes;
//}
//auto nelem = ggml_nelements(dst);
//int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
//ggml_cuda_set_device(ctx.device);
//ptr = buffer.ptr;
//for (int i = 0; i < nhave; ++i) {
// if (i == ctx.device) continue;
// CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
// if (dst->type == GGML_TYPE_F16) {
// k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->src[nhave-1]->data);
// } else {
// k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const float *)ptr, (float *)dst->src[nhave-1]->data);
// }
// ptr += nbytes;
//}
//if (!ctx.copy_event) {
// CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming));
//}
//CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream()));
//for (int i = 0; i < nhave; ++i) {
// if (i == ctx.device) continue;
// ggml_cuda_set_device(i);
// CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
// CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->src[nhave-1]->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
//}
//ggml_cuda_set_device(ctx.device);
//auto tim2 = std::chrono::steady_clock::now();
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
//fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
//GGML_ABORT("Fatal error");
}