mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
Be more careful with having set the device before using a stream (#1093)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -3429,6 +3429,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
|
|||||||
|
|
||||||
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
|
|
||||||
|
ggml_cuda_set_device(cuda_ctx->device);
|
||||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3530,6 +3531,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||||||
}
|
}
|
||||||
|
|
||||||
// record event on src stream after the copy
|
// record event on src stream after the copy
|
||||||
|
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||||
if (!cuda_ctx_src->copy_event) {
|
if (!cuda_ctx_src->copy_event) {
|
||||||
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
|
||||||
}
|
}
|
||||||
@@ -3547,6 +3549,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// src and dst are on the same backend
|
// src and dst are on the same backend
|
||||||
|
printf("Why is this being invoked?\n");
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -343,6 +343,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
|||||||
auto elem_size = ggml_element_size(dst);
|
auto elem_size = ggml_element_size(dst);
|
||||||
for (int ii = 0; ii < nhave; ++ii) {
|
for (int ii = 0; ii < nhave; ++ii) {
|
||||||
int i = idx[ii];
|
int i = idx[ii];
|
||||||
|
ggml_cuda_set_device(i);
|
||||||
int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device);
|
int this_nelem = std::min(nelem_per_device, nelem - ii*nelem_per_device);
|
||||||
copy_task task;
|
copy_task task;
|
||||||
task.nptr = nhave;
|
task.nptr = nhave;
|
||||||
@@ -389,18 +390,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
|||||||
//printf("Submitted kernels\n");
|
//printf("Submitted kernels\n");
|
||||||
for (int ii = 0; ii < nhave; ++ii) {
|
for (int ii = 0; ii < nhave; ++ii) {
|
||||||
int i = idx[ii];
|
int i = idx[ii];
|
||||||
|
ggml_cuda_set_device(i);
|
||||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||||
}
|
}
|
||||||
//printf("Recorded events again\n");
|
//printf("Recorded events again\n");
|
||||||
for (int ii = 0; ii < nhave; ++ii) {
|
for (int ii = 0; ii < nhave; ++ii) {
|
||||||
int i = idx[ii];
|
int i = idx[ii];
|
||||||
|
ggml_cuda_set_device(i);
|
||||||
for (int jj = 0; jj < nhave; ++jj) {
|
for (int jj = 0; jj < nhave; ++jj) {
|
||||||
if (jj == ii) continue;
|
if (jj == ii) continue;
|
||||||
int j = idx[jj];
|
int j = idx[jj];
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//printf("All good so far\n");
|
ggml_cuda_set_device(ctx.device);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto required_size = nbytes*(nhave-1);
|
auto required_size = nbytes*(nhave-1);
|
||||||
|
|||||||
Reference in New Issue
Block a user