diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index f7c1593b..15359647 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector unique_ids; ggml_tensor * last_ids_tensor = nullptr; + for (int i = 0; i < sched->n_splits; i++) { + auto split = &splits[i]; + if (split->n_inputs < 1) continue; + int n_host_inputs = 0; + int n_peer_inputs = 0; + for (int j = 0; j < split->n_inputs; ++j) { + if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) { + ++n_host_inputs; + } else { + ++n_peer_inputs; + } + } + if (n_host_inputs == 0 && n_peer_inputs == 1) { + auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy); + printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs, + ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer); + } else { + printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs); + } + } + for (int i = 0; i < sched->n_splits; i++) { #if IK_PRINT_TIMING int64_t tim1 = ggml_time_us(); @@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done + //auto tim1 = ggml_time_us(); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } + //auto tim2 = ggml_time_us(); + //printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name); ggml_backend_tensor_copy(input, input_cpy); } else { // wait for the split backend to finish using the input before overwriting it + //auto tim1 = ggml_time_us(); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } + //auto tim2 = ggml_time_us(); + //printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name); ggml_tensor * node = split->graph.nodes[0]; if (sched->only_active_experts && split->graph.n_nodes > 0 && @@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + //auto tim1 = ggml_time_us(); ggml_backend_synchronize(input_backend); + //auto tim2 = ggml_time_us(); + //printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name); + //tim1 = ggml_time_us(); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); } + //tim2 = ggml_time_us(); + //printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name); ggml_backend_tensor_copy(input, input_cpy); } } diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index cd0bf889..74d79670 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ } if (backend_src != backend_dst) { + ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool()); + ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool()); // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); @@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ #ifdef GGML_CUDA_NO_PEER_COPY return false; #else - if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + + if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) { // // The goal here is to reduce traffic between GPU's, which is entirely non-negligible // for prompt processing. @@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ // iBut for some reason the following is not working. // Can somebody tell me why? // - ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool(), ggml_nelements(src)); - ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst)); + tmp_src.alloc(ggml_nelements(src)); + tmp_dst.alloc(ggml_nelements(dst)); auto src_f16 = *src; src_f16.type = GGML_TYPE_F16; for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2; src_f16.data = tmp_src.get(); - auto dst_f16 = *dst; - dst_f16.type = GGML_TYPE_F16; - for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2; - dst_f16.data = tmp_dst.get(); - ggml_cuda_set_device(cuda_ctx_src->device); ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true); - CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream())); - - CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream())); - - ggml_cuda_set_device(cuda_ctx_dst->device); - CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream())); - ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true); + //printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name); + CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream())); } else { + //if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name); CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); } #endif @@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ // wait on dst stream for the copy to complete CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); + if (tmp_dst.ptr) { + auto dst_f16 = *dst; + dst_f16.type = GGML_TYPE_F16; + for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2; + dst_f16.data = tmp_dst.get(); + ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true); + } } else { // src and dst are on the same backend CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 379068c9..b96a1714 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -1744,6 +1744,60 @@ ggml_cgraph * llm_build_context::build_llama() { //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f; + + // Sadly, the following is much slower + if (false && model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) { + int n_split = model.splits.size(); + std::vector cur(model.splits.size(), inpL), prev(model.splits.size(), inpL); + + auto combine = [&cur, &prev, n_split, ctx = ctx0] () { + for (int id = 0; id < n_split; ++id) { + if (cur[id]) { + bool first = true; + for (int id1 = 0; id1 < n_split; ++id1) { + if (prev[id1]) { + cur[id] = ggml_add(ctx, cur[id], prev[id1]); + if (first) { + cur[id]->op_params[0] = 0xff; + first = false; + } + } + } + } + } + for (int id = 0; id < n_split; ++id) prev[id] = cur[id]; + }; + + for (int il = 0; il < n_layer; ++il) { + + build_std_attention_split(gf, prev, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, 0, il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + for (int id = 0; id < n_split; ++id) { + if (cur[id]) cur[id] = ggml_get_rows(ctx0, cur[id], inp_out_ids); + if (prev[id]) prev[id] = ggml_get_rows(ctx0, prev[id], inp_out_ids); + } + } + + combine(); + + llm_build_ffn_split(ctx0, lctx, model.layers[il].ffn_norm, prev, + model.layers[il].ffn_up, model.layers[il].ffn_gate, model.layers[il].ffn_down, + LLM_FFN_SILU, cb, il, gf); + + combine(); + } + + auto result = build_output(lctx, ctx0, cur[model.main_gpu], model.output, model.output_norm, cb); + cb(result, "result_output", -1); + ggml_build_forward_expand(gf, result); + + return gf; + } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL;