Another attempt to fix the illegal memory access bug
@@ -20,6 +20,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }

 GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // TODO
+    //if (size == 0) {
+    //    // return a dummy buffer for zero-sized allocations
+    //    return ggml_backend_buffer_init(buft, NULL, NULL, 0);
+    //}
     return buft->iface.alloc_buffer(buft, size);
 }

@@ -92,6 +97,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }

 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);

     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
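The two hunks above both deal with zero-sized buffers: ggml_backend_buft_alloc_buffer keeps a commented-out TODO for returning a dummy buffer, and ggml_backend_buffer_get_base now treats get_base as optional when the buffer size is zero. A minimal standalone sketch of that guard, using illustrative mock_* types rather than the real ggml structs:

// Sketch of the zero-size guard pattern in the hunks above.
// mock_buffer / mock_buffer_get_base are stand-ins, not the ggml API.
#include <cassert>
#include <cstddef>
#include <cstdio>

struct mock_buffer {
    void * (*get_base)(mock_buffer *); // may be left unset for dummy buffers
    size_t  size;
};

void * mock_buffer_get_base(mock_buffer * buf) {
    // get_base is optional if the buffer is zero-sized
    if (buf->size == 0) {
        return nullptr;
    }
    void * base = buf->get_base(buf);
    assert(base != nullptr && "backend buffer base cannot be NULL");
    return base;
}

int main() {
    mock_buffer empty = { /*get_base=*/nullptr, /*size=*/0 };
    // Without the size check this would call a null function pointer.
    std::printf("base of empty buffer: %p\n", mock_buffer_get_base(&empty));
    return 0;
}

The early NULL return means a backend that hands out a dummy zero-sized buffer does not have to provide a working get_base for it.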
@@ -222,14 +232,14 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
     if (!size) {
         return;
     }

+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+

 #if IK_PRINT_TIMING
     int64_t tim1 = ggml_time_us();
@@ -243,16 +253,18 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *

 }

-GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

-    if (!size) {
-        return;
-    }

 #if IK_PRINT_TIMING
     int64_t tim1 = ggml_time_us();
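Both ggml_backend_tensor_set and ggml_backend_tensor_get now return before the buffer/data asserts when size is zero, so a zero-sized copy no longer requires the tensor to be backed by an allocated buffer. A hedged, standalone sketch of that reordering, with mock types and a plain memcpy instead of the backend interface:

// Sketch: early-return on size == 0 before validating buffer/data,
// mirroring the reordering in ggml_backend_tensor_set/get above.
// mock_tensor is illustrative, not the ggml struct.
#include <cassert>
#include <cstring>
#include <cstdio>

struct mock_tensor {
    void * data;   // may be NULL if the tensor was never allocated
    size_t nbytes;
};

void mock_tensor_set(mock_tensor * t, const void * src, size_t offset, size_t size) {
    if (size == 0) {
        return; // nothing to copy; do not touch t->data at all
    }
    assert(t->data != nullptr && "tensor not allocated");
    assert(offset + size <= t->nbytes && "tensor write out of bounds");
    std::memcpy((char *)t->data + offset, src, size);
}

int main() {
    mock_tensor unallocated = { nullptr, 0 };
    mock_tensor_set(&unallocated, nullptr, 0, 0); // ok: zero-size is a no-op

    char storage[4] = {};
    mock_tensor small = { storage, sizeof(storage) };
    mock_tensor_set(&small, "hi", 0, 2);
    std::printf("%.2s\n", storage);
    return 0;
}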
@@ -1148,7 +1160,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }

 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
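The scheduler change above resolves a view tensor to the buffer of its view_src, the same resolution already used in tensor_set/tensor_get. A small sketch of the lookup, with illustrative structs rather than ggml's:

// Sketch: a view tensor does not own storage; its effective buffer is the
// one of the tensor it views. mock_* structs are illustrative only.
#include <cstdio>

struct mock_buffer { const char * name; };

struct mock_tensor {
    mock_buffer * buffer;    // NULL for pure views
    mock_tensor * view_src;  // set when this tensor is a view of another
};

mock_buffer * effective_buffer(const mock_tensor * t) {
    return t->view_src ? t->view_src->buffer : t->buffer;
}

int main() {
    mock_buffer cuda0 = { "CUDA0" };
    mock_tensor owner = { &cuda0, nullptr };
    mock_tensor view  = { nullptr, &owner };
    // Looking only at view.buffer would yield NULL here; the view_src lookup
    // finds the buffer that actually backs the data.
    std::printf("%s\n", effective_buffer(&view)->name);
    return 0;
}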
@@ -1198,6 +1210,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }

+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
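This added guard (a similar block exists in mainline llama.cpp) turns a silent mis-assignment into an immediate abort: if a pre-allocated tensor sits in a buffer whose backend cannot run the op, continuing would eventually touch memory on the wrong device. A rough sketch of the fail-fast pattern, with a hypothetical pick_backend and a mock tensor type:

// Sketch: abort with context instead of scheduling an op onto a tensor whose
// pre-allocated buffer cannot run it. Not ggml code; names are illustrative.
#include <cstdio>
#include <cstdlib>

struct mock_tensor {
    const char * name;
    const char * buffer_name;
    bool         buffer_supports_op;
};

int pick_backend(const mock_tensor & t) {
    if (!t.buffer_supports_op) {
        std::fprintf(stderr, "pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation\n",
                     t.name, t.buffer_name);
        std::abort(); // fail fast instead of running on the wrong device
    }
    return 0; // id of the backend that owns the buffer
}

int main() {
    mock_tensor ok = { "ffn_up", "CUDA0", true };
    std::printf("backend id: %d\n", pick_backend(ok));
    return 0;
}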
@@ -1679,6 +1697,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }

+    // mainline: int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
@@ -1998,12 +2017,14 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

     ggml_backend_sched_split_graph(sched, measure_graph);

+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);
+    //ggml_backend_sched_synchronize(sched);

     return true;
 }
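The reserve path now synchronizes before ggml_gallocr_reserve_n reallocates the backend buffers, and the synchronize after the reset is commented out. A toy sketch of why the ordering matters, using std::async as a stand-in for in-flight backend work (not the ggml API):

// Sketch: finish pending asynchronous work before reallocating the storage it
// may still reference. std::async models an in-flight copy, nothing more.
#include <cstdio>
#include <cstring>
#include <future>
#include <vector>

int main() {
    std::vector<char> buffer(1024, 0);

    // pending "async copy" that still writes into the current buffer
    std::future<void> pending = std::async(std::launch::async, [&buffer] {
        std::memset(buffer.data(), 1, buffer.size());
    });

    pending.wait();          // synchronize first ...
    buffer.assign(4096, 0);  // ... then it is safe to grow/reallocate

    std::printf("reallocated to %zu bytes\n", buffer.size());
    return 0;
}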
@@ -2440,6 +2440,14 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     const ggml_tensor * src1 = dst->src[2];
     const ggml_tensor * ids = dst->src[3];

+    int device_id = ctx.device;
+    bool can_fuse_down = next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
+                         ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
+                         !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
+                         ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&
+                         ggml_backend_buffer_is_cuda(next->buffer) &&
+                         ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id;
+
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0_1->type) &&
         ggml_is_quantized(src0_2->type) &&
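The CUDA hunks hoist the eligibility test for fusing the following MUL_MAT_ID (the MoE down GEMM) into a single can_fuse_down flag computed once at the top of ggml_cuda_up_gate_unary, so the quantized fast path and the fallback path below key off the same condition. A simplified sketch of that hoisting, with a mock predicate instead of the CUDA buffer checks:

// Sketch: compute an eligibility flag once and reuse it on every code path,
// instead of duplicating a long && chain. Mock op fields, not ggml-cuda checks.
#include <cstdio>

struct mock_op { bool is_mul_mat_id; bool on_same_device; bool quantized; };

int main() {
    mock_op next = { true, true, true };

    // hoisted once, like can_fuse_down in the hunk above
    const bool can_fuse_down = next.is_mul_mat_id && next.on_same_device && next.quantized;

    if (can_fuse_down) {
        std::printf("fast path: fuse the down GEMM\n");
    }
    if (can_fuse_down) {
        std::printf("fallback path: same decision, no duplicated condition\n");
    }
    return 0;
}

Note that before this change the fallback path (last hunk below) fused based only on next->op == GGML_OP_MUL_MAT_ID, without the buffer and device checks that the fast path applied.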
@@ -2450,7 +2458,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
         !ggml_backend_buffer_is_cuda_split(src0_1->buffer) &&
         !ggml_backend_buffer_is_cuda_split(src0_2->buffer) &&
         src1->type == GGML_TYPE_F32) {
-        int device_id = ctx.device;
         ggml_backend_cuda_buffer_context * src0_1_ctx = (ggml_backend_cuda_buffer_context *) src0_1->buffer->context;
         ggml_backend_cuda_buffer_context * src0_2_ctx = (ggml_backend_cuda_buffer_context *) src0_2->buffer->context;
         ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
@@ -2500,12 +2507,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
                 0, src0_2->ne[1], 1, src1_padded_col_size, stream);
         CUDA_CHECK(cudaGetLastError());

-        if (next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
-            ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
-            !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
-            ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&
-            ggml_backend_buffer_is_cuda(next->buffer) &&
-            ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id) {
+        if (can_fuse_down) {

             ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst->ne[0]*n_ids,
                 (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
@@ -2599,7 +2601,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     dst_row.nb[3] = nb1;

     bool fuse_down = false;
-    if (next && next->op == GGML_OP_MUL_MAT_ID) {
+    if (can_fuse_down) {
         //printf("Fusing MoE down gemm\n");
         fuse_down = true;
         final_dst = *next;