Ensure the correct device is set before using a stream

This commit is contained in:
Iwan Kawrakow
2025-12-26 18:17:16 +00:00
parent b79bf6c0ef
commit 07759f172c
2 changed files with 7 additions and 1 deletions

View File

@@ -3429,6 +3429,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
ggml_cuda_set_device(cuda_ctx->device);
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
}
@@ -3530,6 +3531,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
// record event on src stream after the copy
ggml_cuda_set_device(cuda_ctx_src->device);
if (!cuda_ctx_src->copy_event) {
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
}
@@ -3547,6 +3549,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
}
} else {
// src and dst are on the same backend
printf("Why is this being invoked?\n");
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
}
return true;

View File

@@ -343,6 +343,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
auto elem_size = ggml_element_size(dst);
for (int ii = 0; ii < nhave; ++ii) {
int i = idx[ii];
ggml_cuda_set_device(i);
int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device);
copy_task task;
task.nptr = nhave;
@@ -389,18 +390,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
//printf("Submitted kernels\n");
for (int ii = 0; ii < nhave; ++ii) {
int i = idx[ii];
ggml_cuda_set_device(i);
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
}
//printf("Recorded events again\n");
for (int ii = 0; ii < nhave; ++ii) {
int i = idx[ii];
ggml_cuda_set_device(i);
for (int jj = 0; jj < nhave; ++jj) {
if (jj == ii) continue;
int j = idx[jj];
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
}
}
//printf("All good so far\n");
ggml_cuda_set_device(ctx.device);
return;
}
auto required_size = nbytes*(nhave-1);