diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index f40c8df8..a2f9716c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -479,6 +479,11 @@ static std::atomic<int> ggml_cuda_lock_counter;
 
 ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
     device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
+    auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
+    if (info->all_ctx[device]) {
+        GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
+    }
+    info->all_ctx[device] = this;
 }
 
 ggml_backend_cuda_context::~ggml_backend_cuda_context() {
@@ -486,6 +491,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
     std::unique_lock<std::mutex> lock(ggml_cuda_lock);
     ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
 
+    auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
+    info->all_ctx[this->device] = nullptr;
+
     if (copy_event != nullptr) {
         CUDA_CHECK(cudaEventDestroy(copy_event));
     }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 6c852807..86b52440 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -742,6 +742,8 @@ struct ggml_cuda_type_traits {
 
 //////////////////////
 
+struct ggml_backend_cuda_context;
+
 struct ggml_cuda_device_info {
     int device_count;
 
@@ -760,6 +762,7 @@
     std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
 
 #ifdef GGML_USE_NCCL
+    ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
     ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
     bool have_nccl;
 #endif
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
new file mode 100644
index 00000000..c09d2903
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -0,0 +1,64 @@
+//
+// Copyright (C) 2023-2024 The ggml authors
+// Copyright (C) 2024 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "reduce.cuh"
+
+void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+
+    auto op = (ggml_op)dst->op_params[0];
+    GGML_ASSERT(op == GGML_OP_ADD);
+    int nreduce = dst->op_params[1];
+    int nhave   = dst->op_params[2];
+    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(nhave >= 2 && nhave <= nreduce);
+
+    //printf("============================== %s on device %d\n", __func__, ctx.device);
+
+#ifdef GGML_USE_NCCL
+    auto & info = ggml_cuda_info();
+    GGML_ASSERT(info.have_nccl);
+    GGML_ASSERT(info.device_count == nreduce);
+    auto type = dst->type;
+    //int device = ctx.device;
+    if (nreduce != info.device_count) {
+        GGML_ABORT("Not implemented");
+    }
+    ncclGroupStart();
+    for (int i = 0; i < nreduce; ++i) {
+        ncclComm_t this_comm;
+        if (nhave == nreduce) {
+            this_comm = info.nccl_coms[i];
+        } else {
+            auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 1 : 0, i, &this_comm, NULL);
+            GGML_ASSERT(status == ncclSuccess);
+        }
+        auto stream = info.all_ctx[i]->stream();
+        GGML_ASSERT(stream);
+        ncclResult_t status;
+        if (type == GGML_TYPE_F32) {
+            status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
+                                   dst->src[i] ? dst->src[i]->data : nullptr,
+                                   ggml_nelements(dst),
+                                   ncclFloat, ncclSum, this_comm, stream);
+        } else {
+            status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
+                                   dst->src[i] ? dst->src[i]->data : nullptr,
+                                   ggml_nelements(dst),
+                                   ncclHalf, ncclSum, this_comm, stream);
+        }
+        if (status != ncclSuccess) {
+            fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
+            GGML_ABORT("Fatal error");
+        }
+    }
+    ncclGroupEnd();
+    return;
+#endif
+    fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
+    GGML_ABORT("Fatal error");
+}
diff --git a/ggml/src/ggml-cuda/reduce.cuh b/ggml/src/ggml-cuda/reduce.cuh
new file mode 100644
index 00000000..2338c2c6
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cuh
@@ -0,0 +1,7 @@
+#include "common.cuh"
+
+#define CUDA_REDUCE_BLOCK_SIZE 256
+
+void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_fake_cpy(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
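
Note (not part of the patch): ggml_cuda_op_reduce takes its configuration from the destination tensor rather than from explicit arguments. As read above, op_params[0] is the reduced op (only GGML_OP_ADD is handled), op_params[1] is nreduce (the total number of participating devices), op_params[2] is nhave (how many of them actually contribute a source), and dst->src[i] appears to hold device i's partial result, or nullptr if that device has none. A minimal sketch of how a graph builder might fill in such a node follows; the helper name set_reduce_params is hypothetical and does not exist in this patch.

    // Hypothetical helper illustrating the op_params layout ggml_cuda_op_reduce expects.
    // dst->src[i] is assumed to already point at device i's partial result (or nullptr).
    static void set_reduce_params(ggml_tensor * dst, int nreduce, int nhave) {
        dst->op_params[0] = GGML_OP_ADD; // the only op the CUDA path currently asserts for
        dst->op_params[1] = nreduce;     // total number of devices taking part in the reduction
        dst->op_params[2] = nhave;       // devices that actually provide a src tensor (>= 2)
    }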
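The patch also assumes that info.nccl_coms and info.have_nccl are populated somewhere during backend setup; that initialization is not shown here. Purely as a sketch of what such setup could look like with the standard NCCL API (one communicator per visible device, created as a single clique), assuming the function name ggml_cuda_nccl_init, its call site, and the inclusion of common.cuh, <vector>, and <cstdio>:

    #ifdef GGML_USE_NCCL
    // Hypothetical one-time setup: create one NCCL communicator per device.
    static void ggml_cuda_nccl_init(ggml_cuda_device_info & info) {
        std::vector<int> devices(info.device_count);
        for (int i = 0; i < info.device_count; ++i) {
            devices[i] = i;
        }
        ncclResult_t status = ncclCommInitAll(info.nccl_coms, info.device_count, devices.data());
        info.have_nccl = (status == ncclSuccess);
        if (!info.have_nccl) {
            fprintf(stderr, "%s: ncclCommInitAll failed with status %d\n", __func__, (int)status);
        }
    }
    #endif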