diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index f40c8df8..a2f9716c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -479,6 +479,11 @@ static std::atomic<int> ggml_cuda_lock_counter;
 
 ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
     device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
+    auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
+    if (info->all_ctx[device]) {
+        GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
+    }
+    info->all_ctx[device] = this;
 }
 
 ggml_backend_cuda_context::~ggml_backend_cuda_context() {
@@ -486,6 +491,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
     std::unique_lock<std::mutex> lock(ggml_cuda_lock);
     ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
 
+    auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
+    info->all_ctx[this->device] = nullptr;
+
     if (copy_event != nullptr) {
         CUDA_CHECK(cudaEventDestroy(copy_event));
     }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 6c852807..86b52440 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -742,6 +742,8 @@ struct ggml_cuda_type_traits {
 
 //////////////////////
 
+struct ggml_backend_cuda_context;
+
 struct ggml_cuda_device_info {
     int device_count;
 
@@ -760,6 +762,7 @@
     std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
 
 #ifdef GGML_USE_NCCL
+    ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
     ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
     bool have_nccl;
 #endif
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
new file mode 100644
index 00000000..c09d2903
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -0,0 +1,64 @@
+//
+// Copyright (C) 2023-2024 The ggml authors
+// Copyright (C) 2024 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "reduce.cuh"
+
+void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+
+    auto op = (ggml_op)dst->op_params[0];
+    GGML_ASSERT(op == GGML_OP_ADD);
+    int nreduce = dst->op_params[1];
+    int nhave   = dst->op_params[2];
+    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(nhave >= 2 && nhave <= nreduce);
+
+    //printf("============================== %s on device %d\n", __func__, ctx.device);
+
+#ifdef GGML_USE_NCCL
+    auto & info = ggml_cuda_info();
+    GGML_ASSERT(info.have_nccl);
+    GGML_ASSERT(info.device_count == nreduce);
+    auto type = dst->type;
+    //int device = ctx.device;
+    if (nreduce != info.device_count) {
+        GGML_ABORT("Not implemented");
+    }
+    ncclGroupStart();
+    for (int i = 0; i < nreduce; ++i) {
+        ncclComm_t this_comm;
+        if (nhave == nreduce) {
+            this_comm = info.nccl_coms[i];
+        } else {
+            auto status = ncclCommSplit(info.nccl_coms[i], dst->src[i] ? 1 : 0, i, &this_comm, NULL);
+            GGML_ASSERT(status == ncclSuccess);
+        }
+        auto stream = info.all_ctx[i]->stream();
+        GGML_ASSERT(stream);
+        ncclResult_t status;
+        if (type == GGML_TYPE_F32) {
+            status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
+                                   dst->src[i] ? dst->src[i]->data : nullptr,
+                                   ggml_nelements(dst),
+                                   ncclFloat, ncclSum, this_comm, stream);
+        } else {
+            status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
+                                   dst->src[i] ? dst->src[i]->data : nullptr,
+                                   ggml_nelements(dst),
+                                   ncclHalf, ncclSum, this_comm, stream);
+        }
+        if (status != ncclSuccess) {
+            fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
+            GGML_ABORT("Fatal error");
+        }
+    }
+    ncclGroupEnd();
+    return;
+#endif
+    fprintf(stderr, "%s: not implemented without NCCL\n", __func__);
+    GGML_ABORT("Fatal error");
+}
diff --git a/ggml/src/ggml-cuda/reduce.cuh b/ggml/src/ggml-cuda/reduce.cuh
new file mode 100644
index 00000000..2338c2c6
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce.cuh
@@ -0,0 +1,7 @@
+#include "common.cuh"
+
+#define CUDA_REDUCE_BLOCK_SIZE 256
+
+void ggml_cuda_op_reduce(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_fake_cpy(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
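
Note (not part of the patch): ggml_cuda_op_reduce takes its configuration from the destination tensor rather than from explicit arguments. As read above, op_params[0] is the reduced op (only GGML_OP_ADD is handled), op_params[1] is nreduce (the total number of participating devices), op_params[2] is nhave (how many of them actually contribute a source), and dst->src[i] appears to hold device i's partial result, or nullptr if that device has none. A minimal sketch of how a graph builder might fill in such a node follows; the helper name set_reduce_params is hypothetical and does not exist in this patch.

    // Hypothetical helper illustrating the op_params layout ggml_cuda_op_reduce expects.
    // dst->src[i] is assumed to already point at device i's partial result (or nullptr).
    static void set_reduce_params(ggml_tensor * dst, int nreduce, int nhave) {
        dst->op_params[0] = GGML_OP_ADD; // the only op the CUDA path currently asserts for
        dst->op_params[1] = nreduce;     // total number of devices taking part in the reduction
        dst->op_params[2] = nhave;       // devices that actually provide a src tensor (>= 2)
    }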
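The patch also assumes that info.nccl_coms and info.have_nccl are populated somewhere during backend setup; that initialization is not shown here. Purely as a sketch of what such setup could look like with the standard NCCL API (one communicator per visible device, created as a single clique), assuming the function name ggml_cuda_nccl_init, its call site, and the inclusion of common.cuh, <vector>, and <cstdio>:

    #ifdef GGML_USE_NCCL
    // Hypothetical one-time setup: create one NCCL communicator per device.
    static void ggml_cuda_nccl_init(ggml_cuda_device_info & info) {
        std::vector<int> devices(info.device_count);
        for (int i = 0; i < info.device_count; ++i) {
            devices[i] = i;
        }
        ncclResult_t status = ncclCommInitAll(info.nccl_coms, info.device_count, devices.data());
        info.have_nccl = (status == ncclSuccess);
        if (!info.have_nccl) {
            fprintf(stderr, "%s: ncclCommInitAll failed with status %d\n", __func__, (int)status);
        }
    }
    #endif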