From 7146de451d47aae7be10818c4ab3a87767ab15f7 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Fri, 26 Dec 2025 19:19:41 +0100
Subject: [PATCH] Be more careful with having set the device before using a
 stream (#1093)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml/src/ggml-cuda.cu        | 3 +++
 ggml/src/ggml-cuda/reduce.cu | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index ad641c85..e28d229c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3429,6 +3429,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
 
     GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
+    ggml_cuda_set_device(cuda_ctx->device);
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
 }
 
@@ -3530,6 +3531,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         }
 
         // record event on src stream after the copy
+        ggml_cuda_set_device(cuda_ctx_src->device);
         if (!cuda_ctx_src->copy_event) {
             CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
         }
@@ -3547,6 +3549,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         }
     } else {
         // src and dst are on the same backend
+        printf("Why is this being invoked?\n");
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
     }
     return true;
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index bbba3243..53d54f74 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -343,6 +343,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         auto elem_size = ggml_element_size(dst);
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
+            ggml_cuda_set_device(i);
             int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device);
             copy_task task;
             task.nptr = nhave;
@@ -389,18 +390,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         //printf("Submitted kernels\n");
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
+            ggml_cuda_set_device(i);
             CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
         }
         //printf("Recorded events again\n");
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
+            ggml_cuda_set_device(i);
             for (int jj = 0; jj < nhave; ++jj) {
                 if (jj == ii) continue;
                 int j = idx[jj];
                 CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
             }
         }
-        //printf("All good so far\n");
+        ggml_cuda_set_device(ctx.device);
         return;
     }
     auto required_size = nbytes*(nhave-1);