diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index f7c1593b..15359647 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1879,6 +1879,27 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
     std::vector<uint32_t> unique_ids;
     ggml_tensor * last_ids_tensor = nullptr;
 
+    for (int i = 0; i < sched->n_splits; i++) {
+        auto split = &splits[i];
+        if (split->n_inputs < 1) continue;
+        int n_host_inputs = 0;
+        int n_peer_inputs = 0;
+        for (int j = 0; j < split->n_inputs; ++j) {
+            if (ggml_backend_buffer_is_host(split->inputs[j]->buffer)) {
+                ++n_host_inputs;
+            } else {
+                ++n_peer_inputs;
+            }
+        }
+        if (n_host_inputs == 0 && n_peer_inputs == 1) {
+            auto input_cpy = tensor_copy(split->inputs[0], split->backend_id, sched->cur_copy);
+            printf("Split %4d: backend = %d, %d host, %d peer inputs, data size = %zu, buffer = %p\n", i, split->backend_id, n_host_inputs, n_peer_inputs,
+                    ggml_nbytes(split->inputs[0]), (const void *)input_cpy->buffer);
+        } else {
+            printf("Split %4d: backend = %d, %d host, %d peer inputs\n", i, split->backend_id, n_host_inputs, n_peer_inputs);
+        }
+    }
+
     for (int i = 0; i < sched->n_splits; i++) {
 #if IK_PRINT_TIMING
         int64_t tim1 = ggml_time_us();
@@ -1897,19 +1918,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
+                //auto tim1 = ggml_time_us();
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
+                //auto tim2 = ggml_time_us();
+                //printf("Synchronized split backend %s (1) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
                 // wait for the split backend to finish using the input before overwriting it
+                //auto tim1 = ggml_time_us();
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
+                //auto tim2 = ggml_time_us();
+                //printf("Synchronized split backend %s (2) for %d us to copy %s\n", ggml_backend_name(split_backend), int(tim2-tim1), input->name);
 
                 ggml_tensor * node = split->graph.nodes[0];
                 if (sched->only_active_experts && split->graph.n_nodes > 0 &&
@@ -2000,12 +2027,18 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                 // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                 if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    //auto tim1 = ggml_time_us();
                     ggml_backend_synchronize(input_backend);
+                    //auto tim2 = ggml_time_us();
+                    //printf("Synchronized input backend %s for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
+                    //tim1 = ggml_time_us();
                     if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                         ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                     } else {
                         ggml_backend_synchronize(split_backend);
                     }
+                    //tim2 = ggml_time_us();
+                    //printf("Synchronized split backend %s (3) for %d us to copy %s\n", ggml_backend_name(input_backend), int(tim2-tim1), input->name);
                     ggml_backend_tensor_copy(input, input_cpy);
                 }
             }
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index cd0bf889..74d79670 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3440,6 +3440,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }
 
     if (backend_src != backend_dst) {
+        ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool());
+        ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool());
         // copy on src stream
         if (cuda_ctx_src->device == cuda_ctx_dst->device) {
             CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3447,7 +3449,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
 #else
-            if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+
+            if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ggml_nrows(src) >= 32) {
                 //
                 // The goal here is to reduce traffic between GPU's, which is entirely non-negligible
                 // for prompt processing.
@@ -3458,30 +3461,21 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
                 // iBut for some reason the following is not working.
                 // Can somebody tell me why?
                 //
-                ggml_cuda_pool_alloc<half> tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
-                ggml_cuda_pool_alloc<half> tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
+                tmp_src.alloc(ggml_nelements(src));
+                tmp_dst.alloc(ggml_nelements(dst));
 
                 auto src_f16 = *src;
                 src_f16.type = GGML_TYPE_F16;
                 for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
                 src_f16.data = tmp_src.get();
 
-                auto dst_f16 = *dst;
-                dst_f16.type = GGML_TYPE_F16;
-                for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
-                dst_f16.data = tmp_dst.get();
-
                 ggml_cuda_set_device(cuda_ctx_src->device);
                 ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
-                CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
-
-                CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
-
-                ggml_cuda_set_device(cuda_ctx_dst->device);
-                CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
-                ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
 
+                //printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
+                CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
             } else {
+                //if (src->type == GGML_TYPE_F32) printf("cudaMemcpyPeerAsync(%s -> %s)\n", src->name, dst->name);
                 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
             }
 #endif
@@ -3497,6 +3491,13 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 
         // wait on dst stream for the copy to complete
         CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
+        if (tmp_dst.ptr) {
+            auto dst_f16 = *dst;
+            dst_f16.type = GGML_TYPE_F16;
+            for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
+            dst_f16.data = tmp_dst.get();
+            ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
+        }
     } else {
         // src and dst are on the same backend
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 379068c9..b96a1714 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1744,6 +1744,60 @@ ggml_cgraph * llm_build_context::build_llama() {
 
     //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
     const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
+
+    // Sadly, the following is much slower
+    if (false && model.split_mode == LLAMA_SPLIT_MODE_GRAPH && model.splits.size() > 1) {
+        int n_split = model.splits.size();
+        std::vector<ggml_tensor *> cur(model.splits.size(), inpL), prev(model.splits.size(), inpL);
+
+        auto combine = [&cur, &prev, n_split, ctx = ctx0] () {
+            for (int id = 0; id < n_split; ++id) {
+                if (cur[id]) {
+                    bool first = true;
+                    for (int id1 = 0; id1 < n_split; ++id1) {
+                        if (prev[id1]) {
+                            cur[id] = ggml_add(ctx, cur[id], prev[id1]);
+                            if (first) {
+                                cur[id]->op_params[0] = 0xff;
+                                first = false;
+                            }
+                        }
+                    }
+                }
+            }
+            for (int id = 0; id < n_split; ++id) prev[id] = cur[id];
+        };
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            build_std_attention_split(gf, prev, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, 0, il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                for (int id = 0; id < n_split; ++id) {
+                    if (cur[id])  cur[id]  = ggml_get_rows(ctx0, cur[id],  inp_out_ids);
+                    if (prev[id]) prev[id] = ggml_get_rows(ctx0, prev[id], inp_out_ids);
+                }
+            }
+
+            combine();
+
+            llm_build_ffn_split(ctx0, lctx, model.layers[il].ffn_norm, prev,
+                    model.layers[il].ffn_up, model.layers[il].ffn_gate, model.layers[il].ffn_down,
+                    LLM_FFN_SILU, cb, il, gf);
+
+            combine();
+        }
+
+        auto result = build_output(lctx, ctx0, cur[model.main_gpu], model.output, model.output_norm, cb);
+        cb(result, "result_output", -1);
+        ggml_build_forward_expand(gf, result);
+
+        return gf;
+    }
+
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;