mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-23 06:34:13 +00:00
WIP
This commit is contained in:
@@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
std::vector<uint32_t> unique_ids;
|
||||
ggml_tensor * last_ids_tensor = nullptr;
|
||||
|
||||
for (int i = 0; i < sched->n_splits; i++) {
|
||||
auto split = &splits[i];
|
||||
if (split->n_inputs < 1) continue;
|
||||
int n_host_inputs = 0;
|
||||
int n_peer_inputs = 0;
|
||||
for (int j = 0; j < split->n_inputs; ++j) {
|
||||
if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) {
|
||||
++n_host_inputs;
|
||||
} else {
|
||||
++n_peer_inputs;
|
||||
}
|
||||
}
|
||||
if (n_host_inputs == 0 && n_peer_inputs == 1) {
|
||||
auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy);
|
||||
printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs,
|
||||
ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer);
|
||||
} else {
|
||||
printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < sched->n_splits; i++) {
|
||||
#if IK_PRINT_TIMING
|
||||
int64_t tim1 = ggml_time_us();
|
||||
@@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
//auto tim1 = ggml_time_us();
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
//auto tim2 = ggml_time_us();
|
||||
//printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
//auto tim1 = ggml_time_us();
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
//auto tim2 = ggml_time_us();
|
||||
//printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
|
||||
|
||||
ggml_tensor * node = split->graph.nodes[0];
|
||||
if (sched->only_active_experts && split->graph.n_nodes > 0 &&
|
||||
@@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||
//auto tim1 = ggml_time_us();
|
||||
ggml_backend_synchronize(input_backend);
|
||||
//auto tim2 = ggml_time_us();
|
||||
//printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
|
||||
//tim1 = ggml_time_us();
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
//tim2 = ggml_time_us();
|
||||
//printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
}
|
||||
|
||||
if (backend_src != backend_dst) {
|
||||
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
|
||||
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
@@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
return false;
|
||||
#else
|
||||
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
|
||||
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) {
|
||||
//
|
||||
// The goal here is to reduce traffic between GPU's, which is entirely non-negligible
|
||||
// for prompt processing.
|
||||
@@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
// But for some reason the following is not working.
|
||||
// Can somebody tell me why?
|
||||
//
|
||||
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
|
||||
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
|
||||
tmp_src.alloc(ggml_nelements(src));
|
||||
tmp_dst.alloc(ggml_nelements(dst));
|
||||
|
||||
auto src_f16 = *src;
|
||||
src_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
|
||||
src_f16.data = tmp_src.get();
|
||||
|
||||
auto dst_f16 = *dst;
|
||||
dst_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
|
||||
dst_f16.data = tmp_dst.get();
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
|
||||
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
|
||||
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
|
||||
|
||||
//printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
|
||||
} else {
|
||||
//if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||
}
|
||||
#endif
|
||||
@@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
|
||||
// wait on dst stream for the copy to complete
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
|
||||
if (tmp_dst.ptr) {
|
||||
auto dst_f16 = *dst;
|
||||
dst_f16.type = GGML_TYPE_F16;
|
||||
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
|
||||
dst_f16.data = tmp_dst.get();
|
||||
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
|
||||
}
|
||||
} else {
|
||||
// src and dst are on the same backend
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
|
||||
@@ -1744,6 +1744,60 @@ ggml_cgraph * llm_build_context::build_llama() {
|
||||
|
||||
//const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
|
||||
|
||||
// Sadly, the following is much slower
|
||||
if (false && model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) {
|
||||
int n_split = model.splits.size();
|
||||
std::vector<ggml_tensor *> cur(model.splits.size(), inpL), prev(model.splits.size(), inpL);
|
||||
|
||||
auto combine = [&cur, &prev, n_split, ctx = ctx0] () {
|
||||
for (int id = 0; id < n_split; ++id) {
|
||||
if (cur[id]) {
|
||||
bool first = true;
|
||||
for (int id1 = 0; id1 < n_split; ++id1) {
|
||||
if (prev[id1]) {
|
||||
cur[id] = ggml_add(ctx, cur[id], prev[id1]);
|
||||
if (first) {
|
||||
cur[id]->op_params[0] = 0xff;
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int id = 0; id < n_split; ++id) prev[id] = cur[id];
|
||||
};
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
build_std_attention_split(gf, prev, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, 0, il);
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
for (int id = 0; id < n_split; ++id) {
|
||||
if (cur[id]) cur[id] = ggml_get_rows(ctx0, cur[id], inp_out_ids);
|
||||
if (prev[id]) prev[id] = ggml_get_rows(ctx0, prev[id], inp_out_ids);
|
||||
}
|
||||
}
|
||||
|
||||
combine();
|
||||
|
||||
llm_build_ffn_split(ctx0, lctx, model.layers[il].ffn_norm, prev,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_gate, model.layers[il].ffn_down,
|
||||
LLM_FFN_SILU, cb, il, gf);
|
||||
|
||||
combine();
|
||||
}
|
||||
|
||||
auto result = build_output(lctx, ctx0, cur[model.main_gpu], model.output, model.output_norm, cb);
|
||||
cb(result, "result_output", -1);
|
||||
ggml_build_forward_expand(gf, result);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user