From 7146de451d47aae7be10818c4ab3a87767ab15f7 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 26 Dec 2025 19:19:41 +0100 Subject: [PATCH] Be more careful with having set the device before using a stream (#1093) Co-authored-by: Iwan Kawrakow --- ggml/src/ggml-cuda.cu | 3 +++ ggml/src/ggml-cuda/reduce.cu | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index ad641c85..e28d229c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3429,6 +3429,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + ggml_cuda_set_device(cuda_ctx->device); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); } @@ -3530,6 +3531,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ } // record event on src stream after the copy + ggml_cuda_set_device(cuda_ctx_src->device); if (!cuda_ctx_src->copy_event) { CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); } @@ -3547,6 +3549,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ } } else { // src and dst are on the same backend + printf("Why is this being invoked?\n"); CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu index bbba3243..53d54f74 100644 --- a/ggml/src/ggml-cuda/reduce.cu +++ b/ggml/src/ggml-cuda/reduce.cu @@ -343,6 +343,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ auto elem_size = ggml_element_size(dst); for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; + ggml_cuda_set_device(i); int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device); copy_task task; task.nptr = nhave; @@ -389,18 +390,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ //printf("Submitted kernels\n"); for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; + ggml_cuda_set_device(i); CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); } //printf("Recorded events again\n"); for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; + ggml_cuda_set_device(i); for (int jj = 0; jj < nhave; ++jj) { if (jj == ii) continue; int j = idx[jj]; CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); } } - //printf("All good so far\n"); + ggml_cuda_set_device(ctx.device); return; } auto required_size = nbytes*(nhave-1);