mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-02 18:10:02 +00:00
WIP
This commit is contained in:
@@ -250,47 +250,36 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef GGML_USE_NCCL
|
||||
info.have_nccl = false;
|
||||
if (info.device_count > 1) {
|
||||
if (info.device_count == 4) {
|
||||
int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
int gpus[4] = {0, 1, 2, 3};
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+8, 4, gpus); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
info.have_nccl = true;
|
||||
printf("=============================== NCCL initialized\n");
|
||||
} else if (info.device_count == 3) {
|
||||
int devs[4] = {0,1, 0,2};
|
||||
for (int ip = 0; ip < 2; ++ip) {
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+2*ip, 2, devs+2*ip); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
int gpus[3] = {0, 1, 2};
|
||||
if (auto status = ncclCommInitAll(info.nccl_coms+4, 3, gpus); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of 4 GPUs failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
info.have_nccl = true;
|
||||
printf("=============================== NCCL initialized\n");
|
||||
} else {
|
||||
int gpu_list[GGML_CUDA_MAX_DEVICES];
|
||||
for(int i = 0; i < info.device_count; ++i) gpu_list[i] = i;
|
||||
auto status = ncclCommInitAll(info.nccl_coms, info.device_count, gpu_list);
|
||||
if (status == ncclSuccess) {
|
||||
printf("=============================== NCCL initialized\n");
|
||||
printf("=============================== NCCL main communicator initialized\n");
|
||||
info.have_nccl = true;
|
||||
} else {
|
||||
printf("=============================== NCCL initialization failed with status %d\n", int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto com = info.nccl_coms + info.device_count;
|
||||
if (info.device_count == 4) {
|
||||
int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
auto com = info.nccl_coms + info.device_count;
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
|
||||
} else if (info.device_count == 3) {
|
||||
int devs[4] = {0,1, 0,2};
|
||||
for (int ip = 0; ip < 2; ++ip) {
|
||||
if (auto status = ncclCommInitAll(com+2*ip, 2, devs+2*ip); status != ncclSuccess) {
|
||||
printf("=============================== NCCL initialization of pair %d failed with status %d\n", ip, int(status));
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
printf("=============================== NCCL pair communicators for %d GPUs initialized\n", info.device_count);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -3512,8 +3501,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
needs_f16_f32_copy = true;
|
||||
|
||||
} else {
|
||||
#ifdef GGML_USE_NCCL__
|
||||
auto & info = ggml_cuda_info();
|
||||
auto nbytes = ggml_nbytes(src);
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
|
||||
info.all_ctx[cuda_ctx_src->device]->stream());
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
|
||||
info.all_ctx[cuda_ctx_dst->device]->stream());
|
||||
ncclGroupEnd();
|
||||
GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
|
||||
return true;
|
||||
#else
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -4435,6 +4439,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] con
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (!enable_p2p) {
|
||||
printf("================== P2P disabled, but needed for NCCL\n");
|
||||
enable_p2p = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_CUDA_NO_PEER_COPY)
|
||||
if (enable_p2p) {
|
||||
ggml_cuda_set_peer_access(device);
|
||||
|
||||
@@ -38,114 +38,84 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ABORT("Not implemented");
|
||||
}
|
||||
//auto tim1 = std::chrono::steady_clock::now();
|
||||
if (nreduce == 4) {
|
||||
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
|
||||
if (dst->ne[1] > 32) {
|
||||
static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(devs[2*ip+0]);
|
||||
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
|
||||
ggml_cuda_set_device(devs[2*ip+1]);
|
||||
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
|
||||
if (nreduce == 4 && dst->ne[1] > 32) {
|
||||
auto com = info.nccl_coms + info.device_count;
|
||||
static const int devs[8] = {0,1, 2,3, 0,2, 1,3};
|
||||
for (int ip = 0; ip < 4; ++ip) {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
ggml_cuda_set_device(i);
|
||||
auto stream = info.all_ctx[i]->stream();
|
||||
GGML_ASSERT(stream);
|
||||
auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum,
|
||||
info.nccl_coms[8+i], stream);
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
}
|
||||
}
|
||||
else if (nreduce == 3) {
|
||||
auto data_type = type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;
|
||||
if (dst->ne[1] > 32) {
|
||||
static const int devs[4] = {0,1, 0,2};
|
||||
for (int ip = 0; ip < 2; ++ip) {
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(devs[2*ip+0]);
|
||||
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
|
||||
ggml_cuda_set_device(devs[2*ip+1]);
|
||||
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(0);
|
||||
auto status1 = ncclSend(dst->src[0]->data, ggml_nelements(dst), data_type, 1, info.nccl_coms[0], info.all_ctx[0]->stream());
|
||||
ggml_cuda_set_device(1);
|
||||
auto status2 = ncclRecv(dst->src[1]->data, ggml_nelements(dst), data_type, 0, info.nccl_coms[1], info.all_ctx[1]->stream());
|
||||
ggml_cuda_set_device(devs[2*ip+0]);
|
||||
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
|
||||
ggml_cuda_set_device(devs[2*ip+1]);
|
||||
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclSend/Recv failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
} else {
|
||||
}
|
||||
}
|
||||
else if (nreduce == 3 && dst->ne[1] > 32) {
|
||||
auto com = info.nccl_coms + info.device_count;
|
||||
static const int devs[4] = {0,1, 0,2};
|
||||
for (int ip = 0; ip < 2; ++ip) {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
ggml_cuda_set_device(i);
|
||||
auto stream = info.all_ctx[i]->stream();
|
||||
GGML_ASSERT(stream);
|
||||
auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data, ggml_nelements(dst), data_type, ncclSum,
|
||||
info.nccl_coms[4+i], stream);
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce on device %d failed with status %d\n", __func__, i, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ggml_cuda_set_device(devs[2*ip+0]);
|
||||
auto status1 = ncclAllReduce(dst->src[devs[2*ip+0]]->data, dst->src[devs[2*ip+0]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, com[2*ip+0], info.all_ctx[devs[2*ip+0]]->stream());
|
||||
ggml_cuda_set_device(devs[2*ip+1]);
|
||||
auto status2 = ncclAllReduce(dst->src[devs[2*ip+1]]->data, dst->src[devs[2*ip+1]]->data,
|
||||
ggml_nelements(dst), data_type, ncclSum, com[2*ip+1], info.all_ctx[devs[2*ip+1]]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
ncclComm_t this_comm;
|
||||
if (nhave == nreduce) {
|
||||
this_comm = info.nccl_coms[i];
|
||||
} else {
|
||||
auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 1 : 0, i, &this_comm, NULL);
|
||||
GGML_ASSERT(status == ncclSuccess);
|
||||
}
|
||||
ggml_cuda_set_device(i);
|
||||
auto stream = info.all_ctx[i]->stream();
|
||||
GGML_ASSERT(stream);
|
||||
ncclResult_t status;
|
||||
if (type == GGML_TYPE_F32) {
|
||||
status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst),
|
||||
ncclFloat, ncclSum, this_comm, stream);
|
||||
} else {
|
||||
status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst),
|
||||
ncclHalf, ncclSum, this_comm, stream);
|
||||
}
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(0);
|
||||
auto status1 = ncclSend(dst->src[0]->data, ggml_nelements(dst), data_type, 1, info.nccl_coms[0], info.all_ctx[0]->stream());
|
||||
ggml_cuda_set_device(1);
|
||||
auto status2 = ncclRecv(dst->src[1]->data, ggml_nelements(dst), data_type, 0, info.nccl_coms[1], info.all_ctx[1]->stream());
|
||||
ncclGroupEnd();
|
||||
if (status1 != ncclSuccess || status2 != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclSend/Recv failed with statuses %d, %d\n", __func__, (int)status1, (int)status2);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
else {
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < nreduce; ++i) {
|
||||
ncclComm_t this_comm;
|
||||
if (nhave == nreduce) {
|
||||
this_comm = info.nccl_coms[i];
|
||||
} else {
|
||||
auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 1 : 0, i, &this_comm, NULL);
|
||||
GGML_ASSERT(status == ncclSuccess);
|
||||
}
|
||||
ggml_cuda_set_device(i);
|
||||
auto stream = info.all_ctx[i]->stream();
|
||||
GGML_ASSERT(stream);
|
||||
ncclResult_t status;
|
||||
if (type == GGML_TYPE_F32) {
|
||||
status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst),
|
||||
ncclFloat, ncclSum, this_comm, stream);
|
||||
} else {
|
||||
status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst),
|
||||
ncclHalf, ncclSum, this_comm, stream);
|
||||
}
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
//auto tim2 = std::chrono::steady_clock::now();
|
||||
|
||||
Reference in New Issue
Block a user