cuda: set device to src device before p2p copy (#1073)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-03-13 15:30:03 +00:00 · 2025-12-17 12:50:34 +01:00
parent 7bb79eff48
commit 21fc9322f9
1 changed file with 1 addition and 2 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3451,6 +3451,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
                needs_f16_f32_copy = true;

            } else {
+                ggml_cuda_set_device(cuda_ctx_src->device);
                CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
            }
 #endif
@@ -3458,10 +3459,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_

        // record event on src stream after the copy
        if (!cuda_ctx_src->copy_event) {
-            ggml_cuda_set_device(cuda_ctx_src->device);
            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
        }
-
        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

        // wait on dst stream for the copy to complete