diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index be8d6fc0..11891ea2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1068,16 +1068,6 @@ extern "C" {
             struct ggml_tensor  * a,
             enum   ggml_op        op);
 
-    GGML_API struct ggml_tensor * ggml_reduce(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum   ggml_op        op);
-
-    GGML_API struct ggml_tensor * ggml_reduce_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum   ggml_op        op);
-
     GGML_API struct ggml_tensor * ggml_add_cast(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -3058,6 +3048,18 @@ extern "C" {
             int                   split_dim,
             struct ggml_tensor  * tensor);
 
+    GGML_API struct ggml_tensor * ggml_reduce(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_split_tensor_t * b,
+            enum   ggml_op        op);
+
+    GGML_API struct ggml_tensor * ggml_reduce_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            ggml_split_tensor_t * b,
+            enum   ggml_op        op);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 30db6939..8d97de48 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <thread>
 
 #define IK_PRINT_TIMING 0
 
@@ -1421,6 +1422,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (node->op == GGML_OP_REDUCE) {
+            auto extra = (const ggml_split_tensor_t *)node->extra;
+            GGML_ASSERT(extra);
+            for (int j = extra->n_device-1; j >= 0; --j) {
+                if (extra->splits[j]) {
+                    *node_backend_id = j; break;
+                }
+            }
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1652,6 +1662,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
         if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
+            node->op == GGML_OP_REDUCE ||
             node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
             need_new_split = true;
         }
@@ -2184,13 +2195,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         }
     }
 
+    //auto compute = [sched, &needs_sync, &own_cpy] (int my_backend_id) {
+
     struct ggml_backend_sched_split * splits = sched->splits;
+    //bool is_cpu = ggml_backend_is_cpu(sched->backends[my_backend_id]);
 
     std::vector<int32_t> ids;
     std::vector<int32_t> unique_ids;
     ggml_tensor * last_ids_tensor = nullptr;
 
     for (int i = 0; i < sched->n_splits; i++) {
+        //printf("Thread %d: split %d\n", my_backend_id, i);
 #if IK_PRINT_TIMING
         int64_t tim1 = ggml_time_us();
 #endif
@@ -2198,6 +2213,19 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
+        printf("Split %d on backend %d\n", i, split_backend_id);
+
+        auto node = split->graph.nodes[0];
+        //if (node->op == GGML_OP_REDUCE && split_backend_id != my_backend_id && !is_cpu) {
+        //    printf("%s: triggering reduce for %s on backend %d\n", __func__, node->name, my_backend_id);
+        //    auto graph = split->graph;
+        //    graph.n_nodes = 1;
+        //    auto ec = ggml_backend_graph_compute_async(sched->backends[my_backend_id], &graph);
+        //    if (ec != GGML_STATUS_SUCCESS) return ec;
+        //}
+
+        //if (split_backend_id != my_backend_id) continue;
+
         // copy the input tensors to the split backend
         ggml_backend_sched_copy_inputs(sched, split, needs_sync, ids, unique_ids, last_ids_tensor);
 
@@ -2257,6 +2285,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     j0 = j1;
                 }
             }
+            if (node->op == GGML_OP_REDUCE) {
+                for (int ib = 0; ib < sched->n_backends; ++ib) {
+                    if (ib != split_backend_id && !ggml_backend_is_cpu(sched->backends[ib])) {
+                        printf("%s: triggering reduce for %s on backend %d\n", __func__, node->name, ib);
+                        auto graph = split->graph;
+                        graph.n_nodes = 1;
+                        auto ec = ggml_backend_graph_compute_async(sched->backends[ib], &graph);
+                        if (ec != GGML_STATUS_SUCCESS) return ec;
+                    }
+                }
+            }
 
             // record the event of this copy
             if (split->n_inputs > 0) {
@@ -2265,6 +2304,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             }
         }
     }
+    //};
+
+    //std::vector<std::thread> workers;
+    //workers.reserve(sched->n_backends);
+    //for (int i = 0; i < sched->n_backends; ++i) {
+    //    workers.emplace_back(compute, i);
+    //}
+    //for (auto & w : workers) w.join();
 
     sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
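
Note on the scheduler changes above: ggml_backend_sched_split_graph pins every GGML_OP_REDUCE node to the highest-indexed backend that actually holds a split, and forces a new schedule split at the node, so the all-reduce gets its own entry. A minimal standalone sketch of that selection rule (the struct below is a stand-in for ggml_split_tensor_t, not the real definition):

    #include <cstdio>

    struct fake_split_tensor {       // stand-in for ggml_split_tensor_t
        int    n_device;
        void * splits[8];            // non-null => that device holds a split
    };

    // Same walk as in ggml_backend_sched_split_graph: the last device
    // holding a split wins.
    static int pick_reduce_backend(const fake_split_tensor * extra, int fallback) {
        for (int j = extra->n_device - 1; j >= 0; --j) {
            if (extra->splits[j]) return j;
        }
        return fallback;
    }

    int main() {
        int dummy;
        fake_split_tensor t = { 4, { &dummy, nullptr, &dummy, nullptr } };
        printf("reduce node runs on backend %d\n", pick_reduce_backend(&t, -1)); // prints 2
    }
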
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 10505851..adc20abb 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -48,6 +48,7 @@
 #include "ggml-cuda/argmax.cuh"
 #include "ggml-cuda/multiadd.cuh"
 #include "ggml-cuda/hadamard.cuh"
+#include "ggml-cuda/reduce.cuh"
 
 #include
 #include
@@ -2948,6 +2949,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
     //printf("%4d %s(%s) on device %d. time = %ld\n", i, ggml_op_name(dst->op), dst->name, ctx.device, ggml_time_us());
     switch (dst->op) {
+        case GGML_OP_REDUCE:
+            ggml_cuda_op_reduce(ctx, dst);
+            break;
         case GGML_OP_ARGMAX:
             ggml_cuda_argmax(ctx, dst);
             break;
@@ -3747,12 +3751,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
 #endif
 #ifndef NDEBUG
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->buffer);
-            }
-        }
+        //assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        //for (int j = 0; j < GGML_MAX_SRC; j++) {
+        //    if (node->src[j] != nullptr) {
+        //        assert(node->src[j]->buffer);
+        //    }
+        //}
 #endif // NDEBUG
 
         bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, cgraph, i);
@@ -3881,6 +3885,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
+        case GGML_OP_REDUCE:
+            return true;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
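
For reference, the op dispatched here is an all-reduce over per-device splits: every participating device ends up with the elementwise sum of all partial results, computed in place (the NCCL path below calls ncclAllReduce with sendbuff == recvbuff). A CPU model of that semantic, independent of ggml and NCCL:

    #include <cstdio>
    #include <vector>

    // Each inner vector models one device's buffer holding a partial result.
    // After the reduce, every buffer holds the elementwise total.
    static void all_reduce_sum(std::vector<std::vector<float>> & bufs) {
        if (bufs.empty()) return;
        std::vector<float> total(bufs[0].size(), 0.0f);
        for (const auto & b : bufs)
            for (size_t i = 0; i < b.size(); ++i) total[i] += b[i];
        for (auto & b : bufs) b = total;  // replicate the sum to every device
    }

    int main() {
        std::vector<std::vector<float>> bufs = {{1, 2}, {10, 20}, {100, 200}};
        all_reduce_sum(bufs);
        printf("%g %g\n", bufs[0][0], bufs[2][1]);  // 111 222
    }
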
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
new file mode 100644
index 00000000..6e40d7d5
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -0,0 +1,67 @@
+//
+// Copyright (C) 2023-2024 The ggml authors
+// Copyright (C) 2024 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "reduce.cuh"
+#ifdef GGML_USE_NCCL
+#include <nccl.h>
+#endif
+
+void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+
+    auto op = (ggml_op)dst->op_params[0];
+    GGML_ASSERT(op == GGML_OP_ADD);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    auto extra = (ggml_split_tensor_t *)dst->extra;
+    GGML_ASSERT(extra && extra->n_device > 1);
+
+    printf("============================== %s on device %d\n", __func__, ctx.device);
+
+#ifdef GGML_USE_NCCL
+    auto & info = ggml_cuda_info();
+    GGML_ASSERT(info.have_nccl);
+    GGML_ASSERT(info.device_count >= extra->n_device);
+    int nhave = 0;
+    auto type = dst->type;
+    for (int j = 0; j < extra->n_device; ++j) {
+        if (extra->splits[j]) {
+            GGML_ASSERT(extra->splits[j]->type == type);
+            GGML_ASSERT(ggml_are_same_shape(dst, extra->splits[j]));
+            ++nhave;
+        }
+    }
+    int device = ctx.device;
+    ncclComm_t this_comm;
+    if (nhave == info.device_count) {
+        this_comm = info.nccl_coms[device];
+    } else {
+        int color = extra->splits[device] ? 1 : 0;
+        auto status = ncclCommSplit(info.nccl_coms[0], color, ctx.device, &this_comm, nullptr);
+        GGML_ASSERT(status == ncclSuccess);
+    }
+    GGML_ASSERT(this_comm);
+    ncclResult_t status;
+    if (type == GGML_TYPE_F32) {
+        status = ncclAllReduce(extra->splits[device]->data,
+                               extra->splits[device]->data,
+                               ggml_nelements(extra->splits[device]),
+                               ncclFloat, ncclSum, this_comm, ctx.stream());
+    } else {
+        status = ncclAllReduce(extra->splits[device]->data,
+                               extra->splits[device]->data,
+                               ggml_nelements(extra->splits[device]),
+                               ncclHalf, ncclSum, this_comm, ctx.stream());
+    }
+    if (status != ncclSuccess) {
+        fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
+        GGML_ABORT("Fatal error");
+    }
+    return;
+#endif
+    fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
+    GGML_ABORT("Fatal error");
+}
diff --git a/ggml/src/ggml-cuda/reduce.cuh b/ggml/src/ggml-cuda/reduce.cuh
new file mode 100644
index 00000000..f7f59030
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_REDUCE_BLOCK_SIZE 256
+
+void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
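
When only a subset of the devices holds a split, reduce.cu derives a sub-communicator with ncclCommSplit: every rank passes a color, ranks with equal color land in the same new communicator, and the all-reduce then runs inside the color-1 group. A standalone sketch of that grouping rule (the device layout is hypothetical; note that non-holders pass color 0 here rather than NCCL_SPLIT_NOCOLOR, so they receive a communicator of their own that simply goes unused):

    #include <cstdio>

    int main() {
        const bool has_split[4] = { true, false, true, true };  // hypothetical layout
        for (int dev = 0; dev < 4; ++dev) {
            int color = has_split[dev] ? 1 : 0;  // same rule as in ggml_cuda_op_reduce
            printf("device %d -> color %d (%s the reduce group)\n",
                   dev, color, color ? "inside" : "outside");
        }
    }
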
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3aa20b48..59855466 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5314,7 +5314,7 @@ ggml_split_tensor_t * ggml_new_split(
     result->n_device = n_device;
     result->split_dim = split_dim;
     result->tensor = tensor;
-    result->splits = (struct ggml_tensor**)(result->tensor + 1);
+    result->splits = (struct ggml_tensor**)(&result->splits + 1);
     for (int i = 0; i < n_device; ++i) {
         result->splits[i] = NULL;
     }
@@ -6085,34 +6085,46 @@ struct ggml_tensor * ggml_dup_inplace(
 static struct ggml_tensor * ggml_reduce_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        ggml_split_tensor_t * extra,
        enum ggml_op op,
         bool inplace) {
     GGML_ASSERT(op == GGML_OP_ADD); // the only op we currently support
-    GGML_ASSERT(a->extra);
-    ggml_split_tensor_t * extra = (ggml_split_tensor_t *)a->extra;
     GGML_ASSERT(extra->n_device > 1);
     GGML_ASSERT(extra->splits);
+    int idx[GGML_MAX_SRC];
     int nhave = 0;
     for (int j = 0; j < extra->n_device; ++j) {
-        if (extra->splits[j]) ++nhave;
+        if (extra->splits[j]) {
+            if (nhave == GGML_MAX_SRC) {
+                GGML_ABORT("Too many tensors to reduce");
+            }
+            idx[nhave++] = j;
+        }
     }
     GGML_ASSERT(nhave > 1);
+    for (int j = 1; j < nhave; ++j) {
+        GGML_ASSERT(ggml_are_same_shape(extra->splits[idx[j]], extra->splits[idx[0]]));
+    }
+
     struct ggml_tensor * result;
     if (inplace) {
         result = ggml_view_tensor(ctx, a);
-        result->src[0] = a;
-        result->extra = a->extra;
+        //for (int j = 0; j < nhave; ++j) result->src[j] = extra->splits[idx[j]];
+        result->extra = extra;
     } else {
-        result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
+        result = ggml_dup_tensor(ctx, a);
         ggml_split_tensor_t * new_extra = ggml_new_split(ctx, extra->n_device, extra->split_dim, result);
         result->extra = new_extra;
+        //int jj = 0;
         for (int j = 0; j < extra->n_device; ++j) {
             if (extra->splits[j]) {
                 new_extra->splits[j] = ggml_dup_tensor(ctx, extra->splits[j]);
+                //result->src[jj++] = new_extra->splits[j];
             }
         }
     }
+    result->src[0] = a;
     result->op = GGML_OP_REDUCE;
     result->op_params[0] = (int32_t)op;
     return result;
@@ -6121,15 +6133,17 @@ static struct ggml_tensor * ggml_reduce_impl(
 struct ggml_tensor * ggml_reduce(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        ggml_split_tensor_t * extra,
         enum ggml_op op) {
-    return ggml_reduce_impl(ctx, a, op, false);
+    return ggml_reduce_impl(ctx, a, extra, op, false);
 }
 
 struct ggml_tensor * ggml_reduce_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        ggml_split_tensor_t * extra,
         enum ggml_op op) {
-    return ggml_reduce_impl(ctx, a, op, true);
+    return ggml_reduce_impl(ctx, a, extra, op, true);
 }
 
@@ -6140,6 +6154,49 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
+
+    //if (a->extra) {
+    //    ggml_split_tensor_t * a_extra = (ggml_split_tensor_t *)a->extra;
+    //    ggml_split_tensor_t * b_extra = (ggml_split_tensor_t *)b->extra;
+    //    GGML_ASSERT(a_extra->n_device > 1);
+    //    if (b_extra) {
+    //        GGML_ASSERT(b_extra->n_device == a_extra->n_device);
+    //    }
+    //    int nhave = 0;
+    //    for (int j = 0; j < a_extra->n_device; ++j) if (a_extra->splits[j]) ++nhave;
+    //    GGML_ASSERT(nhave > 1);
+    //    ggml_split_tensor_t * new_extra = ggml_new_split(ctx, a_extra->n_device, -1, NULL);
+    //    struct ggml_tensor * last = NULL;
+    //    for (int j = 0; j < a_extra->n_device; ++j) {
+    //        if (!a_extra->splits[j]) continue;
+    //        struct ggml_tensor * aj = a_extra->splits[j];
+    //        struct ggml_tensor * bj = b;
+    //        if (b_extra) {
+    //            GGML_ASSERT(b_extra->splits[j]);
+    //            bj = b_extra->splits[j];
+    //        }
+    //        GGML_ASSERT(ggml_are_same_shape(aj, bj));
+    //        GGML_ASSERT(!aj->extra && !bj->extra);
+    //        struct ggml_tensor * abj = inplace ? ggml_view_tensor(ctx, aj) : ggml_dup_tensor(ctx, aj);
+    //        abj->op = GGML_OP_ADD;
+    //        abj->src[0] = aj;
+    //        abj->src[1] = bj;
+    //        abj->src[2] = a;
+    //        abj->src[3] = b;
+    //        new_extra->splits[j] = abj;
+    //        last = abj;
+    //    }
+    //    GGML_ASSERT(last);
+    //    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    //    result->op = GGML_OP_ADD;
+    //    result->src[0] = a;
+    //    result->src[1] = b;
+    //    new_extra->tensor = result;
+    //    result->extra = new_extra;
+    //    ggml_set_input(result);
+    //    return result;
+    //}
+
     GGML_ASSERT(ggml_can_repeat(b, a));
 
     bool is_node = false;
@@ -8495,6 +8552,32 @@ struct ggml_tensor * ggml_get_rows(
     GGML_ASSERT(b->ne[3] == 1);
     GGML_ASSERT(b->type == GGML_TYPE_I32);
 
+    if (a->extra) {
+        ggml_split_tensor_t * extra = (ggml_split_tensor_t *)a->extra;
+        int nhave = 0;
+        for (int j = 0; j < extra->n_device; ++j) if (extra->splits[j]) ++nhave;
+        if (nhave > 1) {
+            ggml_split_tensor_t * new_extra = ggml_new_split(ctx, extra->n_device, -1, NULL);
+            struct ggml_tensor * last = NULL;
+            ggml_split_tensor_t * b_extra = (ggml_split_tensor_t *)b->extra;
+            for (int j = 0; j < extra->n_device; ++j) {
+                if (!extra->splits[j]) continue;
+                struct ggml_tensor * aj = extra->splits[j];
+                struct ggml_tensor * bj = b_extra && b_extra->splits[j] ? b_extra->splits[j] : b;
+                GGML_ASSERT(!aj->extra && !bj->extra);
+                new_extra->splits[j] = ggml_get_rows(ctx, aj, bj);
+                last = new_extra->splits[j];
+            }
+            struct ggml_tensor * result = ggml_view_tensor(ctx, last);
+            result->src[0] = a;
+            result->src[1] = b;
+            result->op = GGML_OP_GET_ROWS;
+            new_extra->tensor = result;
+            result->extra = new_extra;
+            return result;
+        }
+    }
+
     bool is_node = false;
 
     if (a->grad || b->grad) {
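
The one-line fix in ggml_new_split above concerns the trailing pointer array: splits[] lives in the same allocation as the struct, immediately behind it, so it must be addressed relative to the struct's own last member rather than relative to the wrapped tensor pointer. A standalone stand-in for the idiom, assuming splits is the last member of ggml_split_tensor_t (the real definition is not part of this diff):

    #include <cstdio>
    #include <cstdlib>

    struct split_t {            // stand-in for ggml_split_tensor_t
        int     n_device;
        int     split_dim;
        void  * tensor;
        void ** splits;         // last member; the pointer array follows the struct
    };

    static split_t * new_split(int n_device) {
        split_t * s = (split_t *) malloc(sizeof(split_t) + n_device*sizeof(void *));
        s->n_device  = n_device;
        s->split_dim = -1;
        s->tensor    = NULL;
        s->splits    = (void **)(&s->splits + 1);  // first byte past the struct
        for (int i = 0; i < n_device; ++i) s->splits[i] = NULL;
        return s;
    }

    int main() {
        split_t * s = new_split(4);
        // the tail starts exactly sizeof(split_t) bytes into the allocation
        printf("tail offset %zu, sizeof %zu\n",
               (size_t)((char *)s->splits - (char *)s), sizeof(split_t));
        free(s);
    }
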
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 0c0ef048..c781180f 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -650,21 +650,27 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         GGML_ASSERT(u->n_device == g->n_device && u->n_device == d->n_device);
         std::vector<ggml_tensor *> ffn;
         ffn.reserve(u->n_device);
+        auto iextra = (ggml_split_tensor_t *)input->extra;
+        auto split_result = ggml_new_split(ctx, u->n_device, -1, input);
+        ggml_tensor * last_result = nullptr;
         for (int id = 0; id < u->n_device; ++id) {
             int il_cb = 1000*(id+1) + il;
             auto split_u = u->splits[id];
             auto split_g = g->splits[id];
             auto split_d = d->splits[id];
             GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
+            if (iextra) {
+                GGML_ASSERT((!split_u && !iextra->splits[id]) || (split_u && iextra->splits[id]));
+            }
             if (!split_u) continue;
-            auto cur = input;
+            auto cur = iextra ? iextra->splits[id] : input;
             if (ffn_norm && ffn_norm->extra) {
                 auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
                 GGML_ASSERT(norm->splits[id]);
-                cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
+                cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_inp_normed", il_cb);
             }
-            else if (input->type != GGML_TYPE_F32) {
+            if (cur->type != GGML_TYPE_F32) {
                 cur = ggml_cast(ctx, input, GGML_TYPE_F32);
             }
             cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op);
@@ -677,12 +683,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             }
             if (cur->ne[1] >= 32) {
                 cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
+                cb(cur, "ffn_down_f16", il_cb);
             }
             if (graph) {
                 ggml_build_forward_expand(graph, cur);
             }
+            last_result = cur;
+            split_result->splits[id] = cur;
             ffn.push_back(cur);
         }
+        GGML_ASSERT(last_result);
+        auto result = ggml_reduce_inplace(ctx, input, split_result, GGML_OP_ADD);
+        cb(result, "ffn_out_split", il);
+        split_result->tensor = result;
+        return result;
+
         if (ffn.size() == 1) return ffn.front();
         auto cur = ggml_add(ctx, ffn[0], ffn[1]);
         cb(cur, "combine_ffn", il);
@@ -1698,6 +1713,28 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
             }
         }
     } else {
+        if (cur->extra) {
+            auto extra = (ggml_split_tensor_t *)cur->extra;
+            // TODO: get the backend index of the backend where the output tensor is
+            //       and use the corresponding result
+            //if (output->buffer) {
+            //    auto buft = ggml_backend_buffer_get_type(output->buffer);
+            //    if (buft) {
+
+            //    }
+            //}
+            //auto backend = ggml_backend_sched_get_tensor_backend(lctx.sched, output);
+            auto try_cur = extra->splits[lctx.model.main_gpu];
+            if (!try_cur) {
+                for (int i = extra->n_device-1; i >= 0; --i) {
+                    if (extra->splits[i]) {
+                        try_cur = extra->splits[i]; break;
+                    }
+                }
+            }
+            GGML_ASSERT(try_cur);
+            cur = try_cur;
+        }
         if (output_norm) {
             cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
             cb(cur, "output_normed", -1);
@@ -9343,8 +9380,12 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         GGML_ASSERT(bv->n_device == wq->n_device);
     }
     std::vector<ggml_tensor *> attn;
     attn.reserve(wq->n_device);
+    auto iextra = (ggml_split_tensor_t *)input->extra;
+    auto split_result = ggml_new_split(ctx0, wq->n_device, -1, input);
+    ggml_tensor * last_result = nullptr;
     for (int id = 0; id < wq->n_device; ++id) {
         int il_cb = 1000*(id+1) + il;
+        split_result->splits[id] = nullptr;
         auto split_wq = wq->splits[id];
         auto split_wk = wk->splits[id];
         auto split_wv = wv->splits[id];
@@ -9353,14 +9394,17 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         auto split_kl = kl->splits[id];
         auto split_vl = vl->splits[id];
         GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) ||
                     (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
+        if (iextra) {
+            GGML_ASSERT((!split_wq && !iextra->splits[id]) || (split_wq && iextra->splits[id]));
+        }
         if (!split_wq) continue;
-        auto cur = input;
+        auto cur = iextra ? iextra->splits[id] : input;
         if (attn_norm) {
             auto split_norm = attn_norm->splits[id];
             cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il_cb);
         }
-        else if (cur->type != GGML_TYPE_F32) {
+        if (cur->type != GGML_TYPE_F32) {
             cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
         }
         auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
@@ -9484,10 +9528,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         }
         if (cur->ne[1] >= 32) {
             cur = ggml_cast(ctx0, cur, GGML_TYPE_F16);
+            cb(cur, "kqv_wo_f16", il_cb);
         }
         ggml_build_forward_expand(gf, cur);
+        last_result = cur;
+        split_result->splits[id] = cur;
         attn.push_back(cur);
     }
+    GGML_ASSERT(last_result);
+    auto result = ggml_reduce_inplace(ctx0, input, split_result, GGML_OP_ADD);
+    cb(result, "attn_out_split", il);
+    split_result->tensor = result;
+    return result;
+
     GGML_ASSERT(!attn.empty());
     if (attn.size() == 1) return attn.front();
     //if (attn.size() > 2 && attn.size()%2 == 0) {
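
Taken together, both llm_build_ffn and build_std_attention now follow the same graph-building pattern: register each device's partial result in a fresh split, then emit a single GGML_OP_REDUCE node in place of the old chain of ggml_add combines. A hedged sketch against this PR's API (ggml_split_tensor_t, ggml_new_split, and the new ggml_reduce_inplace signature are all introduced in this diff; combine_partials is an illustrative helper, not part of the patch):

    #include "ggml.h"

    static struct ggml_tensor * combine_partials(
            struct ggml_context * ctx,
            struct ggml_tensor  * input,    // full-shape activation the result aliases
            struct ggml_tensor  * parts[],  // per-device partials, same shape; NULL = device holds nothing
            int                   n_device) {
        ggml_split_tensor_t * split = ggml_new_split(ctx, n_device, /*split_dim=*/-1, input);
        for (int id = 0; id < n_device; ++id) {
            split->splits[id] = parts[id];
        }
        // one reduce node sums all non-NULL partials; at least two must be
        // present (ggml_reduce_impl asserts nhave > 1)
        struct ggml_tensor * out = ggml_reduce_inplace(ctx, input, split, GGML_OP_ADD);
        split->tensor = out;
        return out;
    }
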