mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-11 14:30:02 +00:00
WIP: fast PP with bespoke 4-GPU NCCL
I guess, I'm not using NCCL the right way as PP is very low with a single communicator group for 3 or more GPUs. But if I create 4 communicator groups for pairs of GPUs (0,1, 2,3, 0,2, 1,3) and use that, PP is fast: I'm hitting 1500 t/s for L3-70B on the 4x3090 system, which is ~20% better than the previous sm graph without NCCL. But that cannot be the solution (I cannot be creating pairwise communicators and associated logic for every possible number of GPUs).
This commit is contained in:
@@ -250,6 +250,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef GGML_USE_NCCL
|
||||
info.have_nccl = false;
|
||||
if (info.device_count > 1) {
|
||||
if (info.device_count == 4) {
|
||||
int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
int gpus[4] = {0, 1, 2, 3};
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+8, 4, gpus); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
info.have_nccl = true;
|
||||
printf("=============================== NCCL initialized\n");
|
||||
} else {
|
||||
int gpu_list[GGML_CUDA_MAX_DEVICES];
|
||||
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
|
||||
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
|
||||
@@ -258,6 +274,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.have_nccl = true;
|
||||
} else {
|
||||
printf("=============================== NCCL initialization failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,15 @@
|
||||
|
||||
#include "reduce.cuh"
|
||||
|
||||
#include <chrono>
|
||||
|
||||
// Element-wise accumulation: dst[i] += src[i] for every i in [0, nelem).
// Expected launch: 1-D grid, block_size threads per block,
// ceil(nelem / block_size) blocks; block_size must equal blockDim.x.
// Out-of-range threads in the tail block are masked by the bounds guard.
template <typename T, int block_size>
static __global__ void k_add(int nelem, const T * src, T * dst) {
    const int idx = threadIdx.x + block_size*blockIdx.x;
    if (idx < nelem) {
        dst[idx] += src[idx];
    }
}
|
||||
|
||||
void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
auto op = (ggml_op)dst->op_params[0];
|
||||
@@ -19,8 +28,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
|
||||
//printf("============================== %s on device %d with %d sources\n", __func__, ctx.device, nreduce);
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
auto & info = ggml_cuda_info();
|
||||
#ifdef GGML_USE_NCCL
|
||||
GGML_ASSERT(info.have_nccl);
|
||||
GGML_ASSERT(info.device_count == nreduce);
|
||||
auto type = dst->type;
|
||||
@@ -28,6 +37,38 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
if (nreduce != info.device_count) {
|
||||
GGML_ABORT("Not implemented");
|
||||
}
|
||||
//auto tim1 = std::chrono::steady_clock::now();
|
||||
if (nreduce == 4) {
|
||||
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
|
||||
if (dst->ne[1] > 32) {
|
||||
static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
ncclGroupStart();
|
||||
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
|
||||
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
auto stream = info.all_ctx[i]->stream();
|
||||
GGML_ASSERT(stream);
|
||||
auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum,
|
||||
info.nccl_coms[8+i], stream);
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
}
|
||||
} else {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
ncclComm_t this_comm;
|
||||
@@ -57,8 +98,55 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
}
|
||||
//auto tim2 = std::chrono::steady_clock::now();
|
||||
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
|
||||
return;
|
||||
#endif
|
||||
fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
|
||||
GGML_ABORT("Fatal error");
|
||||
//auto tim1 = std::chrono::steady_clock::now();
|
||||
//GGML_ASSERT(nhave == nreduce);
|
||||
//auto nbytes = ggml_nbytes(dst);
|
||||
//ggml_cuda_pool_alloc<char> buffer(ctx.pool(), nbytes*(nhave-1));
|
||||
//auto ptr = buffer.ptr;
|
||||
//for (int i = 0; i < nhave; ++i) {
|
||||
// GGML_ASSERT(dst->src[i]->type == dst->type);
|
||||
// GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
// if (i == ctx.device) continue;
|
||||
// ggml_cuda_set_device(i);
|
||||
// CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
|
||||
// if (!info.all_ctx[i]->copy_event) {
|
||||
// CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
// }
|
||||
// CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
// ptr += nbytes;
|
||||
//}
|
||||
//auto nelem = ggml_nelements(dst);
|
||||
//int num_blocks = (nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
|
||||
//ggml_cuda_set_device(ctx.device);
|
||||
//ptr = buffer.ptr;
|
||||
//for (int i = 0; i < nhave; ++i) {
|
||||
// if (i == ctx.device) continue;
|
||||
// CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
|
||||
// if (dst->type == GGML_TYPE_F16) {
|
||||
// k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->src[nhave-1]->data);
|
||||
// } else {
|
||||
// k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const float *)ptr, (float *)dst->src[nhave-1]->data);
|
||||
// }
|
||||
// ptr += nbytes;
|
||||
//}
|
||||
//if (!ctx.copy_event) {
|
||||
// CUDA_CHECK(cudaEventCreateWithFlags(&ctx.copy_event, cudaEventDisableTiming));
|
||||
//}
|
||||
//CUDA_CHECK(cudaEventRecord(ctx.copy_event, ctx.stream()));
|
||||
//for (int i = 0; i < nhave; ++i) {
|
||||
// if (i == ctx.device) continue;
|
||||
// ggml_cuda_set_device(i);
|
||||
// CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
|
||||
// CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->src[nhave-1]->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
|
||||
//}
|
||||
//ggml_cuda_set_device(ctx.device);
|
||||
//auto tim2 = std::chrono::steady_clock::now();
|
||||
//printf("%s: launched in %g us\n", __func__, 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(tim2-tim1).count());
|
||||
//fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
|
||||
//GGML_ABORT("Fatal error");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user