diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index e191c2d9..82209c7d 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -20,6 +20,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
 }
 
 GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // TODO
+    //if (size == 0) {
+    //    // return a dummy buffer for zero-sized allocations
+    //    return ggml_backend_buffer_init(buft, NULL, NULL, 0);
+    //}
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -92,6 +97,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 }
 
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // get_base is optional if the buffer is zero-sized
+    if (buffer->size == 0) {
+        return NULL;
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -222,14 +232,14 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
 GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
     if (!size) {
         return;
     }
 
+    GGML_ASSERT(buf != NULL && "tensor buffer not set");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+
 #if IK_PRINT_TIMING
     int64_t tim1 = ggml_time_us();
@@ -243,16 +253,18 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
 }
 
-GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
+    if (size == 0) {
+        return;
+    }
+
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    if (!size) {
-        return;
-    }
 
 #if IK_PRINT_TIMING
     int64_t tim1 = ggml_time_us();
@@ -1148,7 +1160,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -1198,6 +1210,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
+    }
+
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1679,6 +1697,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
+    // mainline: int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
@@ -1998,12 +2017,14 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);
+    //ggml_backend_sched_synchronize(sched);
 
     return true;
 }
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 9c8c91f4..6404a281 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2440,6 +2440,14 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     const ggml_tensor * src1 = dst->src[2];
     const ggml_tensor * ids = dst->src[3];
 
+    int device_id = ctx.device;
+    bool can_fuse_down = next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
+        ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
+        !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
+        ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&
+        ggml_backend_buffer_is_cuda(next->buffer) &&
+        ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id;
+
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0_1->type) &&
         ggml_is_quantized(src0_2->type) &&
@@ -2450,7 +2458,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
         !ggml_backend_buffer_is_cuda_split(src0_1->buffer) &&
         !ggml_backend_buffer_is_cuda_split(src0_2->buffer) &&
         src1->type == GGML_TYPE_F32) {
-        int device_id = ctx.device;
         ggml_backend_cuda_buffer_context * src0_1_ctx = (ggml_backend_cuda_buffer_context *) src0_1->buffer->context;
         ggml_backend_cuda_buffer_context * src0_2_ctx = (ggml_backend_cuda_buffer_context *) src0_2->buffer->context;
         ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
@@ -2500,12 +2507,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
                 0, src0_2->ne[1], 1, src1_padded_col_size, stream);
         CUDA_CHECK(cudaGetLastError());
 
-        if (next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
-            ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
-            !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
-            ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&
-            ggml_backend_buffer_is_cuda(next->buffer) &&
-            ((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id) {
+        if (can_fuse_down) {
             ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst->ne[0]*n_ids,
                     (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
@@ -2599,7 +2601,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     dst_row.nb[3] = nb1;
 
     bool fuse_down = false;
-    if (next && next->op == GGML_OP_MUL_MAT_ID) {
+    if (can_fuse_down) {
        //printf("Fusing MoE down gemm\n");
        fuse_down = true;
        final_dst = *next;
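
Illustration of the relaxed zero-size handling above (a sketch only, not part of the patch): with the reordered checks in ggml_backend_tensor_set / ggml_backend_tensor_get, a zero-sized transfer returns before the "tensor buffer not set" / "tensor not allocated" asserts, so it becomes a safe no-op even on a tensor whose data has not been allocated yet. The program below assumes the public ggml-backend API from this tree (CPU backend, ggml_backend_alloc_ctx_tensors) and only demonstrates the call pattern.

    #include <stdio.h>

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        // metadata-only context: tensors are created without data (no_alloc = true)
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

        // size == 0: returns before the buffer/allocation asserts,
        // so this is a no-op rather than an abort on an unallocated tensor
        ggml_backend_tensor_set(t, NULL, 0, 0);

        // allocate real backend memory and do a normal copy for comparison
        ggml_backend_t        backend = ggml_backend_cpu_init();
        ggml_backend_buffer_t buf     = ggml_backend_alloc_ctx_tensors(ctx, backend);

        float data[16] = {0};
        ggml_backend_tensor_set(t, data, 0, sizeof(data));
        ggml_backend_tensor_get(t, data, 0, sizeof(data));

        printf("buffer base: %p\n", ggml_backend_buffer_get_base(buf));

        ggml_backend_buffer_free(buf);
        ggml_backend_free(backend);
        ggml_free(ctx);
        return 0;
    }

With the previous ordering, the first ggml_backend_tensor_set call would have tripped the "tensor buffer not set" assert instead of returning early.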