This commit is contained in:
Kawrakow
2025-12-08 15:44:53 +00:00
parent 66f21fb174
commit c83d2fd335
3 changed files with 103 additions and 15 deletions

View File

@@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<uint32_t> unique_ids;
ggml_tensor * last_ids_tensor = nullptr;
for (int i = 0; i < sched->n_splits; i++) {
auto split = &splits[i];
if (split->n_inputs < 1) continue;
int n_host_inputs = 0;
int n_peer_inputs = 0;
for (int j = 0; j < split->n_inputs; ++j) {
if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) {
++n_host_inputs;
} else {
++n_peer_inputs;
}
}
if (n_host_inputs == 0 && n_peer_inputs == 1) {
auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy);
printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs,
ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer);
} else {
printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs);
}
}
for (int i = 0; i < sched->n_splits; i++) {
#if IK_PRINT_TIMING
int64_t tim1 = ggml_time_us();
@@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_tensor * node = split->graph.nodes[0];
if (sched->only_active_experts && split->graph.n_nodes > 0 &&
@@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
//auto tim1 = ggml_time_us();
ggml_backend_synchronize(input_backend);
//auto tim2 = ggml_time_us();
//printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
//tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//tim2 = ggml_time_us();
//printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
}
}

View File

@@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
if (backend_src != backend_dst) {
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) {
//
// The goal here is to reduce traffic between GPUs, which is entirely non-negligible
// for prompt processing.
@@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// But for some reason the following is not working.
// Can somebody tell me why?
//
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
tmp_src.alloc(ggml_nelements(src));
tmp_dst.alloc(ggml_nelements(dst));
auto src_f16 = *src;
src_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
src_f16.data = tmp_src.get();
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_set_device(cuda_ctx_src->device);
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
ggml_cuda_set_device(cuda_ctx_dst->device);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
//printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
} else {
//if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}
#endif
@@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// wait on dst stream for the copy to complete
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
if (tmp_dst.ptr) {
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
}
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));