Fix ring reduction (#1114)

* Fix ring reduction

* Actually enable it

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow authored on 2026-01-07 08:01:31 +02:00, committed via GitHub
parent 54a513768c
commit a82dcbf3ee
2 changed files with 26 additions and 10 deletions


@@ -843,6 +843,7 @@ struct ggml_backend_cuda_context {
     int device;
     std::string name;
     cudaEvent_t copy_event = nullptr;
+    cudaEvent_t compute_event = nullptr;
     bool p2p_enabled = false;
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
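For reference, compute_event follows the same pattern as the existing copy_event: a timing-disabled CUDA event whose only job is to order work between streams. The sketch below is a minimal, self-contained illustration of that pattern, not code from this repository; the stream and buffer names are invented.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

    int main() {
        cudaStream_t producer, consumer;
        cudaEvent_t  done;                                               // plays the role of copy_event / compute_event
        CHECK(cudaStreamCreate(&producer));
        CHECK(cudaStreamCreate(&consumer));
        CHECK(cudaEventCreateWithFlags(&done, cudaEventDisableTiming));  // sync-only event, no timestamps recorded

        float * buf;
        CHECK(cudaMalloc(&buf, 1024 * sizeof(float)));
        CHECK(cudaMemsetAsync(buf, 0, 1024 * sizeof(float), producer));  // work submitted to the producer stream
        CHECK(cudaEventRecord(done, producer));                          // mark the point the consumer must reach
        CHECK(cudaStreamWaitEvent(consumer, done, 0));                   // consumer stream waits, host does not block
        CHECK(cudaMemsetAsync(buf, 1, 1024 * sizeof(float), consumer));  // safe: ordered after the producer's write

        CHECK(cudaStreamSynchronize(consumer));
        CHECK(cudaFree(buf));
        CHECK(cudaEventDestroy(done));
        CHECK(cudaStreamDestroy(producer));
        CHECK(cudaStreamDestroy(consumer));
        return 0;
    }

cudaEventDisableTiming keeps the event as lightweight as possible, which matters here because an event is recorded per device on every reduction stage.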


@@ -196,7 +196,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // i = 0, peer = 1, ichunk = 1 -> copy part 1 from device 1, device 0 now has parts 0, 1, 2, 3
     // etc.
     //
-    if (false && dst->ne[1] >= 32) {
+    if (dst->ne[1] >= 32) {
         auto nelem = ggml_nelements(dst);
         auto elem_size = ggml_element_size(dst);
         auto nelem_per_device = (nelem + nhave - 1)/nhave;
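The comment above describes how chunks rotate around the ring of devices, each device pulling one chunk per stage from its neighbour and accumulating it. As a hedged illustration of the underlying idea, here is a small host-only C++ simulation of the textbook ring reduce-scatter schedule; the names (n_dev for the number of devices, i.e. nhave, and chunk for the chunk length) are invented and the chunk indexing is not claimed to match this kernel exactly:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // n_dev "devices", each holding a full copy of the tensor split into n_dev chunks.
    // At every stage each device pulls one chunk from its ring neighbour and accumulates it,
    // so after n_dev-1 stages every chunk is fully reduced on exactly one device.
    int main() {
        const int n_dev = 4, chunk = 3, n = n_dev * chunk;
        std::vector<std::vector<float>> data(n_dev, std::vector<float>(n));
        for (int d = 0; d < n_dev; ++d)
            for (int j = 0; j < n; ++j) data[d][j] = float(d + 1);      // device d contributes d+1 everywhere

        for (int stage = 0; stage < n_dev - 1; ++stage) {
            for (int dst = 0; dst < n_dev; ++dst) {
                int src    = (dst - 1 + n_dev) % n_dev;                 // ring neighbour we pull from
                int ichunk = (src - stage + n_dev) % n_dev;             // chunk currently travelling through src
                for (int j = 0; j < chunk; ++j)
                    data[dst][ichunk*chunk + j] += data[src][ichunk*chunk + j];
            }
        }

        const float expected = 1 + 2 + 3 + 4;                           // sum of all contributions
        for (int c = 0; c < n_dev; ++c) {
            int owner = (c + n_dev - 1) % n_dev;                        // device holding the fully reduced chunk c
            for (int j = 0; j < chunk; ++j)
                assert(data[owner][c*chunk + j] == expected);
        }
        printf("reduce-scatter ok: every chunk fully reduced on one device\n");
        return 0;
    }

A full all-reduce would normally follow this reduce-scatter with an all-gather pass so that every device ends up with every reduced chunk.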
@@ -204,17 +204,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             auto this_ctx = info.all_ctx[i];
-            if (!this_ctx->copy_event) {
+            if (!this_ctx->copy_event || !this_ctx->compute_event || required_size > this_ctx->copy_size) {
                 ggml_cuda_set_device(this_ctx->device);
-                CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->copy_event, cudaEventDisableTiming));
-            }
-            if (required_size > this_ctx->copy_size) {
-                ggml_cuda_set_device(this_ctx->device);
-                if (this_ctx->copy_buffer) {
-                    CUDA_CHECK(cudaFree(this_ctx->copy_buffer));
+                if (!this_ctx->copy_event) {
+                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->copy_event, cudaEventDisableTiming));
+                }
+                if (!this_ctx->compute_event) {
+                    CUDA_CHECK(cudaEventCreateWithFlags(&this_ctx->compute_event, cudaEventDisableTiming));
+                }
+                if (required_size > this_ctx->copy_size) {
+                    if (this_ctx->copy_buffer) {
+                        CUDA_CHECK(cudaFree(this_ctx->copy_buffer));
+                    }
+                    CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, required_size, this_ctx->device));
+                    this_ctx->copy_size = required_size;
                 }
-                CUDA_CHECK(ggml_cuda_device_malloc(&this_ctx->copy_buffer, required_size, this_ctx->device));
-                this_ctx->copy_size = required_size;
             }
         }
         for (int stage = 0; stage < nhave-1; ++stage) {
@@ -224,10 +228,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                 ggml_cuda_set_device(info.all_ctx[peer]->device);
+                if (stage > 0) {
+                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                }
                 CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
                             (const char *)dst->src[peer]->data + ichunk*nelem_per_device*elem_size, info.all_ctx[peer]->device,
                             this_nelem*elem_size, info.all_ctx[peer]->stream()));
                 CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
             ichunk = stage;
+            for (int ii = 0; ii < nhave; ++ii) {
+                int i = idx[ii];
+                int peer = idx[(ii+1)%nhave];
+                auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
+                ggml_cuda_set_device(info.all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
@@ -238,6 +252,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                     k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
                             (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
                 }
+                CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
         }
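Taken together, the changes close a reuse race: copy_buffer on device i is refilled by its peer on every stage, so the peer must not start the next stage's cudaMemcpyPeerAsync until device i's k_add from the previous stage has finished reading the buffer. Recording compute_event after the add and waiting on it for stage > 0 enforces exactly that ordering. The following single-device, two-stream sketch mirrors the same handshake; it is an illustration of the pattern, not code from the repository, and the names staging, acc and src are invented (staging stands in for copy_buffer):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

    // Each thread adds one element of src into dst.
    __global__ void add_into(int n, const float * src, float * dst) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dst[i] += src[i];
    }

    int main() {
        const int n = 1 << 20;
        cudaStream_t copy_stream, compute_stream;
        cudaEvent_t  copy_event, compute_event;
        CHECK(cudaStreamCreate(&copy_stream));
        CHECK(cudaStreamCreate(&compute_stream));
        CHECK(cudaEventCreateWithFlags(&copy_event,    cudaEventDisableTiming));
        CHECK(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming));

        float *staging, *acc, *src;
        CHECK(cudaMalloc(&staging, n * sizeof(float)));                  // plays the role of copy_buffer
        CHECK(cudaMalloc(&acc,     n * sizeof(float)));
        CHECK(cudaMalloc(&src,     n * sizeof(float)));
        CHECK(cudaMemset(acc, 0, n * sizeof(float)));
        CHECK(cudaMemset(src, 0, n * sizeof(float)));

        const int n_stages = 3;
        for (int stage = 0; stage < n_stages; ++stage) {
            // Producer: refill the staging buffer, but only once the previous consumer is done with it.
            if (stage > 0) {
                CHECK(cudaStreamWaitEvent(copy_stream, compute_event, 0));
            }
            CHECK(cudaMemcpyAsync(staging, src, n * sizeof(float), cudaMemcpyDeviceToDevice, copy_stream));
            CHECK(cudaEventRecord(copy_event, copy_stream));

            // Consumer: wait for the copy, accumulate, then mark the staging buffer as reusable.
            CHECK(cudaStreamWaitEvent(compute_stream, copy_event, 0));
            add_into<<<(n + 255)/256, 256, 0, compute_stream>>>(n, staging, acc);
            CHECK(cudaEventRecord(compute_event, compute_stream));
        }
        CHECK(cudaStreamSynchronize(compute_stream));
        printf("staging buffer reused %d times without a race\n", n_stages);

        CHECK(cudaFree(staging)); CHECK(cudaFree(acc)); CHECK(cudaFree(src));
        return 0;
    }

Without the wait on compute_event, the next iteration's copy could overwrite staging while the previous add kernel is still reading it, which is exactly the kind of intermittent corruption a missing inter-stream dependency produces.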