This commit is contained in:
Kawrakow
2025-12-08 15:44:53 +00:00
parent 66f21fb174
commit c83d2fd335
3 changed files with 103 additions and 15 deletions

View File

@@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<uint32_t> unique_ids;
ggml_tensor * last_ids_tensor = nullptr;
for (int i = 0; i < sched->n_splits; i++) {
auto split = &splits[i];
if (split->n_inputs < 1) continue;
int n_host_inputs = 0;
int n_peer_inputs = 0;
for (int j = 0; j < split->n_inputs; ++j) {
if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) {
++n_host_inputs;
} else {
++n_peer_inputs;
}
}
if (n_host_inputs == 0 && n_peer_inputs == 1) {
auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy);
printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs,
ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer);
} else {
printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs);
}
}
for (int i = 0; i < sched->n_splits; i++) {
#if IK_PRINT_TIMING
int64_t tim1 = ggml_time_us();
@@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_tensor * node = split->graph.nodes[0];
if (sched->only_active_experts && split->graph.n_nodes > 0 &&
@@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
//auto tim1 = ggml_time_us();
ggml_backend_synchronize(input_backend);
//auto tim2 = ggml_time_us();
//printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
//tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//tim2 = ggml_time_us();
//printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
}
}

View File

@@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
if (backend_src != backend_dst) {
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) {
//
// The goal here is to reduce traffic between GPU's, which is entirely non-negligible
// for prompt processing.
@@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// But for some reason the following is not working.
// Can somebody tell me why?
//
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
tmp_src.alloc(ggml_nelements(src));
tmp_dst.alloc(ggml_nelements(dst));
auto src_f16 = *src;
src_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
src_f16.data = tmp_src.get();
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_set_device(cuda_ctx_src->device);
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
ggml_cuda_set_device(cuda_ctx_dst->device);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
//printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
} else {
//if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}
#endif
@@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// wait on dst stream for the copy to complete
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
if (tmp_dst.ptr) {
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
}
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));

View File

@@ -1744,6 +1744,60 @@ ggml_cgraph * llm_build_context::build_llama() {
//const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
// Sadly, the following is much slower
if (false && model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) {
int n_split = model.splits.size();
std::vector<ggml_tensor *> cur(model.splits.size(), inpL), prev(model.splits.size(), inpL);
auto combine = [&cur, &prev, n_split, ctx = ctx0] () {
for (int id = 0; id < n_split; ++id) {
if (cur[id]) {
bool first = true;
for (int id1 = 0; id1 < n_split; ++id1) {
if (prev[id1]) {
cur[id] = ggml_add(ctx, cur[id], prev[id1]);
if (first) {
cur[id]->op_params[0] = 0xff;
first = false;
}
}
}
}
}
for (int id = 0; id < n_split; ++id) prev[id] = cur[id];
};
for (int il = 0; il < n_layer; ++il) {
build_std_attention_split(gf, prev, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, 0, il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
for (int id = 0; id < n_split; ++id) {
if (cur[id]) cur[id] = ggml_get_rows(ctx0, cur[id], inp_out_ids);
if (prev[id]) prev[id] = ggml_get_rows(ctx0, prev[id], inp_out_ids);
}
}
combine();
llm_build_ffn_split(ctx0, lctx, model.layers[il].ffn_norm, prev,
model.layers[il].ffn_up, model.layers[il].ffn_gate, model.layers[il].ffn_down,
LLM_FFN_SILU, cb, il, gf);
combine();
}
auto result = build_output(lctx, ctx0, cur[model.main_gpu], model.output, model.output_norm, cb);
cb(result, "result_output", -1);
ggml_build_forward_expand(gf, result);
return gf;
}
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;