Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-30 11:21:56 +00:00)
WIP
This commit is contained in:
@@ -479,6 +479,11 @@ static std::atomic<int> ggml_cuda_lock_counter;
|
|||||||
|
|
||||||
ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
    device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
    // Register this context in the shared device-info table so that
    // cross-device operations (e.g. the NCCL reduce) can look up the
    // per-device contexts and their streams by device index.
    auto & info = const_cast<ggml_cuda_device_info &>(ggml_cuda_info());
    if (info.all_ctx[device] != nullptr) {
        // A slot is already occupied: two live contexts for one device is
        // unexpected, but we overwrite the entry and keep going.
        GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
    }
    info.all_ctx[device] = this;
}
||||||
ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
||||||
@@ -486,6 +491,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
|||||||
std::unique_lock<std::mutex> lock(ggml_cuda_lock);
|
std::unique_lock<std::mutex> lock(ggml_cuda_lock);
|
||||||
ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
|
ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
|
||||||
|
|
||||||
|
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
|
||||||
|
info->all_ctx[this->device] = nullptr;
|
||||||
|
|
||||||
if (copy_event != nullptr) {
|
if (copy_event != nullptr) {
|
||||||
CUDA_CHECK(cudaEventDestroy(copy_event));
|
CUDA_CHECK(cudaEventDestroy(copy_event));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -742,6 +742,8 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K_R4> {
|
|||||||
|
|
||||||
//////////////////////
|
//////////////////////
|
||||||
|
|
||||||
|
struct ggml_backend_cuda_context;
|
||||||
|
|
||||||
struct ggml_cuda_device_info {
|
struct ggml_cuda_device_info {
|
||||||
int device_count;
|
int device_count;
|
||||||
|
|
||||||
@@ -760,6 +762,7 @@ struct ggml_cuda_device_info {
|
|||||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||||
|
|
||||||
#ifdef GGML_USE_NCCL
|
#ifdef GGML_USE_NCCL
|
||||||
|
ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
||||||
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
|
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
|
||||||
bool have_nccl;
|
bool have_nccl;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
New file: ggml/src/ggml-cuda/reduce.cu (64 lines)
@@ -0,0 +1,64 @@
|
|||||||
|
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#include "reduce.cuh"

// Element-wise sum of dst->src[0..nreduce-1] (one tensor per CUDA device)
// via an in-place NCCL all-reduce; each participating src tensor ends up
// holding the reduced result.
//
// op_params layout:
//   [0] = reduction op  (only GGML_OP_ADD is supported)
//   [1] = nreduce       (total number of devices/ranks)
//   [2] = nhave         (number of devices that actually hold a src tensor)
//
// Aborts when built without NCCL.
void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    auto op = (ggml_op)dst->op_params[0];
    GGML_ASSERT(op == GGML_OP_ADD);
    int nreduce = dst->op_params[1];
    int nhave   = dst->op_params[2];
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(nhave >= 2 && nhave <= nreduce);

    //printf("============================== %s on device %d\n", __func__, ctx.device);

#ifdef GGML_USE_NCCL
    auto & info = ggml_cuda_info();
    GGML_ASSERT(info.have_nccl);
    // The implementation currently requires exactly one rank per visible
    // device (this replaces the old assert + unreachable "Not implemented"
    // branch that re-checked the same condition).
    GGML_ASSERT(info.device_count == nreduce);

    const auto count = ggml_nelements(dst);
    const ncclDataType_t nccl_type = dst->type == GGML_TYPE_F32 ? ncclFloat : ncclHalf;

    // Sub-communicators created by ncclCommSplit below; destroyed after the
    // group completes so they are no longer leaked on every invocation.
    ncclComm_t split_comms[GGML_CUDA_MAX_DEVICES] = { nullptr };

    ncclGroupStart();
    for (int i = 0; i < nreduce; ++i) {
        ncclComm_t this_comm;
        if (nhave == nreduce) {
            // Every device participates: reuse the pre-built communicators.
            this_comm = info.nccl_coms[i];
        } else {
            // Only a subset of devices holds data: split off a communicator
            // containing just those ranks. Ranks without data opt out via
            // NCCL_SPLIT_NOCOLOR (they receive a NULL communicator) instead
            // of joining color 0 and then all-reducing nullptr buffers.
            auto status = ncclCommSplit(info.nccl_coms[i],
                                        dst->src[i] ? 1 : NCCL_SPLIT_NOCOLOR,
                                        i, &split_comms[i], NULL);
            GGML_ASSERT(status == ncclSuccess);
            this_comm = split_comms[i];
        }
        if (!dst->src[i]) {
            continue; // this device holds no data -> nothing to reduce
        }
        GGML_ASSERT(this_comm != nullptr);
        auto stream = info.all_ctx[i]->stream();
        GGML_ASSERT(stream);
        // In-place all-reduce: send and receive buffers alias src[i]->data.
        auto status = ncclAllReduce(dst->src[i]->data, dst->src[i]->data,
                                    count, nccl_type, ncclSum, this_comm, stream);
        if (status != ncclSuccess) {
            fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
            GGML_ABORT("Fatal error");
        }
    }
    ncclGroupEnd();

    // NOTE(review): ncclCommDestroy implicitly finalizes the communicator and
    // is documented as safe with enqueued operations — confirm this matches
    // the NCCL version in use before relying on it.
    for (int i = 0; i < nreduce; ++i) {
        if (split_comms[i]) {
            ncclCommDestroy(split_comms[i]);
        }
    }
    return;
#endif
    fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
    GGML_ABORT("Fatal error");
}
||||||
New file: ggml/src/ggml-cuda/reduce.cuh (7 lines)
@@ -0,0 +1,7 @@
|
|||||||
|
#pragma once

#include "common.cuh"

#define CUDA_REDUCE_BLOCK_SIZE 256

// In-place cross-device reduction (sum) of dst->src[i] tensors; see reduce.cu.
void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

// Declared here but implemented elsewhere; marks a copy as already satisfied
// by the reduction — TODO confirm intended semantics against the caller.
void ggml_cuda_op_fake_cpy(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
||||||
Reference in New Issue
Block a user