diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 53a36925..c05bf566 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1973,6 +1973,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
     int split_backend_id = split->backend_id;
     ggml_backend_t split_backend = sched->backends[split_backend_id];
     ggml_backend_t last_input_backend = nullptr;
+    bool synced_on_input = false;
     for (int j = 0; j < split->n_inputs; j++) {
         ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
         struct ggml_tensor * input = split->inputs[j];
@@ -1980,10 +1981,14 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
 
         if (input->flags & GGML_TENSOR_FLAG_INPUT) {
             // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
-            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-            } else {
-                ggml_backend_synchronize(split_backend);
+            // if there are multiple inputs for the split, and we have already synchronized this backend, no need to do it again.
+            if (!synced_on_input) {
+                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                } else {
+                    ggml_backend_synchronize(split_backend);
+                }
+                synced_on_input = true;
             }
             ggml_backend_tensor_copy(input, input_cpy);
         } else {
@@ -2162,7 +2167,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
     bool work_done = false;
 
 #ifdef GGML_USE_OPENMP
-    //This maty not be available in old OpenMP versions
+    //This may not be available in old OpenMP versions
    //if (int nlevels = omp_get_max_active_levels(); nlevels < 2) {
    //    omp_set_max_active_levels(nlevels+1);
    //    //printf("%s: Setting omp max active levels to 2\n", __func__);
@@ -2241,6 +2246,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
             if (split->graph.nodes[0]->op == GGML_OP_REDUCE) {
                 last_reduce = split_backend_id;
+                if (ith == split_backend_id) {
+                    auto node = split->graph.nodes[0];
+                    int n = node->op_params[1];
+                    for (int j = 0; j < n; ++j) {
+                        if (node->src[j]) {
+                            sched->needs_sync[j] = false;
+                        }
+                    }
+                }
 #pragma omp barrier
             }
 
@@ -2307,6 +2321,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             if (split->graph.nodes[0]->op == GGML_OP_REDUCE) {
                 last_reduce = split_backend_id;
                 barrier.arrive_and_wait();
+                if (ith == split_backend_id) {
+                    auto node = split->graph.nodes[0];
+                    int n = node->op_params[1];
+                    for (int j = 0; j < n; ++j) {
+                        if (node->src[j]) {
+                            sched->needs_sync[j] = false;
+                        }
+                    }
+                }
             }
             //if (needs_barrier) {
             //    barrier.arrive_and_wait();
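
The ggml-backend.cpp hunks above make ggml_backend_sched_copy_inputs() synchronize the split's backend at most once per split when copying user-provided inputs, instead of once per input tensor. A minimal, self-contained C++ sketch of the same pattern, assuming hypothetical stand-ins: expensive_sync() plays the role of ggml_backend_event_synchronize()/ggml_backend_synchronize(), and copy_input() that of ggml_backend_tensor_copy(); neither is ggml API.

#include <cstdio>
#include <vector>

static void expensive_sync() { std::puts("sync"); }              // stand-in for the backend/event synchronization
static void copy_input(int /*tensor*/) { /* copy one input */ }  // stand-in for ggml_backend_tensor_copy()

static void copy_split_inputs(const std::vector<int> & inputs) {
    bool synced_on_input = false;
    for (int input : inputs) {
        if (!synced_on_input) {
            // first user input of this split: wait once for in-flight work on the backend
            expensive_sync();
            synced_on_input = true;
        }
        copy_input(input);  // later inputs reuse the same synchronization point
    }
}

int main() {
    copy_split_inputs({1, 2, 3});  // expensive_sync() runs once, not three times
    return 0;
}
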
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index 18dc8969..23d054b7 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -6,7 +6,6 @@
 //
 
 #include "reduce.cuh"
-#include "binbcast.cuh"
 #include "ggml-common.h"
 
 #include
@@ -81,6 +80,35 @@ static __global__ void k_reduce_add_T(copy_task task) {
     }
 }
 
+static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor * dst,
+        int nhave, int ncopy, const int * idx, const int * copy_idx) {
+
+    if (ncopy < 1) return;
+
+    auto & info = ggml_cuda_info();
+    auto size = ggml_nbytes(dst);
+    int isrc = 0;
+    for (int ii = 0; ii < ncopy; ++ii) {
+        int i = copy_idx[ii];
+        int j = idx[isrc];
+        isrc = (isrc + 1)%nhave;
+        //printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data);
+        ggml_cuda_set_device(j);
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device,
+                    size, info.all_ctx[j]->stream()));
+        CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
+    }
+    isrc = 0;
+    for (int ii = 0; ii < ncopy; ++ii) {
+        int i = copy_idx[ii];
+        int j = idx[isrc];
+        isrc = (isrc + 1)%nhave;
+        ggml_cuda_set_device(i);
+        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0));
+    }
+    ggml_cuda_set_device(ctx.device);
+}
+
 void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     auto op = (ggml_op)dst->op_params[0];
@@ -90,7 +118,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
 
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_Q8_0 || dst->type == GGML_TYPE_BF16);
     GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(nhave >=2 && nhave <= nreduce);
+    GGML_ASSERT(nhave >= 2 && nhave <= nreduce);
     if (dst->op_params[3] == 1) {
         // The dst tensor is just a container for the sources and the reduce op is turned off
         return;
@@ -125,13 +153,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     GGML_ASSERT(dst->data == dst->src[ctx.device]->data);
     auto nbytes = ggml_nbytes(dst);
     int idx[GGML_CUDA_MAX_DEVICES];
+    int copy_idx[GGML_CUDA_MAX_DEVICES];
+    int ncopy = 0;
     {
         int ii = 0;
         bool have_this_device = false;
         for (int i = 0; i < nreduce; ++i) {
-            if (dst->src[i]) {
-                idx[ii++] = i;
-                if (i == ctx.device) have_this_device = true;
+            if (dst->op_params[4] & (1u << i)) {
+                copy_idx[ncopy++] = i;
+            }
+            else {
+                if (dst->src[i]) {
+                    idx[ii++] = i;
+                    if (i == ctx.device) have_this_device = true;
+                }
             }
         }
         GGML_ASSERT(ii == nhave);
@@ -224,15 +259,6 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         auto nblocks_per_device = (nblocks + nhave - 1)/nhave;
         auto nelem_per_device = nblocks_per_device * tt.blck_size;
         auto size_per_device = nblocks_per_device * tt.type_size;
-        //size_t nelem_per_device, required_size;
-        //if (dst->type == GGML_TYPE_Q8_0) {
-        //    GGML_ASSERT(nelem % QK8_0 == 0);
-        //    nelem_per_device = QK8_0*((nelem/QK8_0 + nhave - 1)/nhave);
-        //    required_size nelem_per_device/QK8_0 * sizeof(ggml_block_q8_0);
-        //}
-        //auto elem_size = ggml_element_size(dst);
-        //auto nelem_per_device = (nelem + nhave - 1)/nhave;
-        //auto required_size = nelem_per_device*elem_size;
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             auto this_ctx = info.all_ctx[i];
@@ -277,8 +303,6 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
             ggml_cuda_set_device(info.all_ctx[i]->device);
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
-            //ggml_op_add_same_type(ctx, dst->type, this_nelem, info.all_ctx[i]->copy_buffer,
-            //        (const char *)dst->src[i]->data + ichunk*size_per_device, (char *)dst->src[i]->data + ichunk*size_per_device);
             int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
             if (dst->type == GGML_TYPE_F16) {
                 k_add<<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
@@ -313,8 +337,6 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                                            (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
                                            this_size, info.all_ctx[peer]->stream()));
             CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
-            //ggml_cuda_set_device(info.all_ctx[i]->device);
-            //CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
             ichunk = (ichunk + 1)%nhave;
         }
         for (int ii = 0; ii < nhave; ++ii) {
@@ -325,6 +347,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             }
         }
         ggml_cuda_set_device(ctx.device);
+        if (ncopy > 0) {
+            copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
+        }
         return;
     }
     if (false && nhave == 4 && dst->ne[1] <= 8 && ctx.p2p_enabled) {
@@ -391,6 +416,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
         }
         ggml_cuda_set_device(ctx.device);
+        if (ncopy > 0) {
+            copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
+        }
         return;
     }
     if (dst->ne[1] < 32 && ctx.p2p_enabled) {
@@ -474,6 +502,9 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             }
         }
         ggml_cuda_set_device(ctx.device);
+        if (ncopy > 0) {
+            copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
+        }
         return;
     }
     auto required_size = nbytes*(nhave-1);
@@ -537,4 +568,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         if (i == ctx.device) continue;
         CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
     }
+    if (ncopy > 0) {
+        copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
+    }
 }
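
The reduce.cu hunks above add copy_missing_tensors(), which broadcasts the reduced result to the devices whose source slot was only a placeholder (the devices flagged in dst->op_params[4] by get_input_tensor_sm_graph() in the next file). A minimal, self-contained sketch of how the destination devices (copy_idx) are paired round-robin with the source devices (idx); the device numbers are made up for illustration and printf stands in for the cudaMemcpyPeerAsync() plus event record/wait sequence used in the patch.

#include <cstdio>

int main() {
    const int idx[]      = {0, 2};  // devices that hold the reduced result (nhave = 2)
    const int copy_idx[] = {1, 3};  // devices whose source slot still needs the result (ncopy = 2)
    const int nhave = 2, ncopy = 2;

    int isrc = 0;
    for (int ii = 0; ii < ncopy; ++ii) {
        int dst_dev = copy_idx[ii];
        int src_dev = idx[isrc];
        isrc = (isrc + 1)%nhave;  // round-robin over the source devices to spread the peer copies
        // in the patch: cudaMemcpyPeerAsync() on src_dev's stream, then an event record on that
        // stream and a cudaStreamWaitEvent() on dst_dev's stream
        std::printf("copy reduced tensor: device %d -> device %d\n", src_dev, dst_dev);
    }
    return 0;
}
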
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index e7d20b21..1aad8c39 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -619,14 +619,19 @@ ggml_tensor * llm_build_context::llm_build_norm(
     return cur;
 }
 
-static ggml_tensor * get_input_tensor_sm_graph(ggml_tensor * input, int id) {
+static ggml_tensor * get_input_tensor_sm_graph(ggml_context * ctx, ggml_tensor * input, int id) {
     auto cur = input;
     if (input->op == GGML_OP_REDUCE) {
         auto view_src = input->view_src;
         GGML_ASSERT(view_src);
         cur = input->src[id];
-        if (cur == view_src || !cur) {
-            //printf("%s: Setting input to %s for id = %d\n", __func__, view_src->name, id);
+        if (!cur) {
+            GGML_ASSERT((input->op_params[4] & (1u << id)) == 0);
+            cur = ggml_dup_tensor(ctx, input);
+            input->src[id] = cur;
+            input->op_params[4] |= (1u << id);
+        }
+        else if (cur == view_src) {
             cur = input;
         }
     }
@@ -693,7 +698,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             auto split_d = d->splits[id];
             GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
             if (!split_u) continue;
-            auto cur = get_input_tensor_sm_graph(input, id);
+            auto cur = get_input_tensor_sm_graph(ctx, input, id);
             cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
             cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op);
             cb(cur, "ffn_up_gate", il_cb);
@@ -1277,17 +1282,8 @@ llm_expert_gating_func_type gating_op,
                         (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
             if (!split_up_exps->splits[id]) continue;
             int il_cb = 1000*(id + 1) + il;
-            auto cur = get_input_tensor_sm_graph(input, id);
+            auto cur = get_input_tensor_sm_graph(ctx, input, id);
             cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, false);
-            //if (ffn_norm) {
-            //    auto split_ffn_norm = (ggml_split_tensor_t *)ffn_norm->extra;
-            //    GGML_ASSERT(split_ffn_norm && split_ffn_norm->n_device == split_up_exps->n_device);
-            //    cur = llm_build_norm(ctx, cur, lctx.model.hparams, split_ffn_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il);
-            //    cb(cur, "ffn_inp_normed", il_cb);
-            //}
-            //if (cur->type != GGML_TYPE_F32) {
-            //    cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
-            //}
             GGML_ASSERT(!split_gate_inp_b || split_gate_inp_b->splits[id]);
             GGML_ASSERT(!split_exps_down_b || split_exps_down_b->splits[id]);
             GGML_ASSERT(!split_exps_gate_b || split_exps_gate_b->splits[id]);
@@ -9190,7 +9186,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
             model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
-            //ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
             auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
             auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
             auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra;
@@ -9230,18 +9225,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 GGML_ASSERT((!split_wq && !split_wk && !split_wv && !split_wo && !split_kl && !split_vl) ||
                              (split_wq && split_wk && split_wv && split_wo && split_kl && split_vl));
                 if (!split_wq) continue;
-                auto cur = get_input_tensor_sm_graph(input, id);
+                auto cur = get_input_tensor_sm_graph(ctx0, input, id);
                 cur = do_split_norm(ctx0, cur, the_attn_norm, lctx.model.hparams, cb, id, il_cb, is_norm);
-                //if (attn_norm) {
-                //    if (is_norm) {
-                //        cur = ggml_fused_norm(ctx0, cur, attn_norm->splits[id], lctx.model.hparams.f_norm_eps);
-                //    } else {
-                //        cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
-                //    }
-                //}
-                //if (cur->type != GGML_TYPE_F32) {
-                //    cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
-                //}
                 auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
                         ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr;
                 auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ?