This commit is contained in:
Kawrakow
2025-12-08 15:44:53 +00:00
parent 66f21fb174
commit c83d2fd335
3 changed files with 103 additions and 15 deletions

View File

@@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<uint32_t> unique_ids;
ggml_tensor * last_ids_tensor = nullptr;
for (int i = 0; i < sched->n_splits; i++) {
auto split = &splits[i];
if (split->n_inputs < 1) continue;
int n_host_inputs = 0;
int n_peer_inputs = 0;
for (int j = 0; j < split->n_inputs; ++j) {
if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) {
++n_host_inputs;
} else {
++n_peer_inputs;
}
}
if (n_host_inputs == 0 && n_peer_inputs == 1) {
auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy);
printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs,
ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer);
} else {
printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs);
}
}
for (int i = 0; i < sched->n_splits; i++) {
#if IK_PRINT_TIMING
int64_t tim1 = ggml_time_us();
@@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
//auto tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//auto tim2 = ggml_time_us();
//printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
ggml_tensor * node = split->graph.nodes[0];
if (sched->only_active_experts && split->graph.n_nodes > 0 &&
@@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
//auto tim1 = ggml_time_us();
ggml_backend_synchronize(input_backend);
//auto tim2 = ggml_time_us();
//printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
//tim1 = ggml_time_us();
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
//tim2 = ggml_time_us();
//printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
ggml_backend_tensor_copy(input, input_cpy);
}
}

View File

@@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
if (backend_src != backend_dst) {
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) {
//
// The goal here is to reduce traffic between GPUs, which is entirely non-negligible
// for prompt processing.
@@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// But for some reason the following is not working.
// Can somebody tell me why?
//
ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
tmp_src.alloc(ggml_nelements(src));
tmp_dst.alloc(ggml_nelements(dst));
auto src_f16 = *src;
src_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
src_f16.data = tmp_src.get();
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_set_device(cuda_ctx_src->device);
ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
ggml_cuda_set_device(cuda_ctx_dst->device);
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
//printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
} else {
//if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}
#endif
@@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
// wait on dst stream for the copy to complete
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
if (tmp_dst.ptr) {
auto dst_f16 = *dst;
dst_f16.type = GGML_TYPE_F16;
for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
dst_f16.data = tmp_dst.get();
ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
}
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));