diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 298d3214..09c93606 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -498,7 +498,8 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
 static std::mutex ggml_cuda_lock;
 static std::condition_variable ggml_cuda_lock_cv;
-static std::atomic<int> ggml_cuda_lock_counter;
+//static std::atomic<int> ggml_cuda_lock_counter;
+static int ggml_cuda_lock_counter = 0;
 
 ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
     device(device),
     name(GGML_CUDA_NAME + std::to_string(device)) {
@@ -511,8 +512,10 @@ ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
 ggml_backend_cuda_context::~ggml_backend_cuda_context() {
+    printf("%s: have %d graphs\n", __func__, int(cuda_graphs.size()));
+
     std::unique_lock<std::mutex> lock(ggml_cuda_lock);
-    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter == 0; });
 
     auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
     info->all_ctx[this->device] = nullptr;
@@ -520,6 +523,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
     if (copy_event != nullptr) {
         CUDA_CHECK(cudaEventDestroy(copy_event));
     }
+    if (compute_event != nullptr) {
+        CUDA_CHECK(cudaEventDestroy(compute_event));
+    }
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
         for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
             if (streams[i][j] != nullptr) {
@@ -3769,6 +3775,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_cuda_graph
         if (ggml_is_noop(node)) continue;
 
+        if (node->op == GGML_OP_REDUCE) {
+            use_cuda_graph = false;
+            break;
+        }
+
         if (node->op == GGML_OP_MUL_MAT_ID && (node->ne[2] != 1 || node->src[2]->ne[0] != 1)) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
@@ -3988,7 +3999,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             graph_evaluated_or_captured = true; // CUDA graph has been captured
 
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+            if (--ggml_cuda_lock_counter == 0) {
                 ggml_cuda_lock_cv.notify_all();
             }
         } else {
@@ -4026,7 +4037,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // Also disable for multi-gpu for now. TO DO investigate
     bool use_cuda_graph = !disable_cuda_graphs_due_to_env && cuda_ctx->use_cuda_graph;
 
-    auto graph = ggml_cuda_get_graph(*cuda_ctx, ggml_cuda_graph_get_key(cgraph));
+    ggml_cuda_graph * graph = nullptr;
+    if (use_cuda_graph) {
+        auto graph_key = ggml_cuda_graph_get_key(cgraph);
+        graph = ggml_cuda_get_graph(*cuda_ctx, graph_key);
+    }
     cuda_ctx->cur_graph = graph;
 
     bool cuda_graph_update_required = false;
@@ -4053,14 +4068,18 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(graph, cgraph, use_cuda_graph, cuda_ctx->stream());
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            graph->number_consecutive_updates++;
-        } else {
-            graph->number_consecutive_updates = 0;
+        if (use_cuda_graph) {
+            if (cuda_graph_update_required) {
+                graph->number_consecutive_updates++;
+            } else {
+                graph->number_consecutive_updates = 0;
+            }
         }
 
         if (graph->number_consecutive_updates >= 4) {
             graph->disable_due_to_too_many_updates = true;
+            use_cuda_graph = false;
+            cuda_ctx->cur_graph = nullptr;
 #ifndef NDEBUG
             GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
@@ -4069,15 +4088,16 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
+        // Why are we protecting an atomic_int with a mutex?
         {
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+            ++ggml_cuda_lock_counter;
         }
 
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
-    if (!use_cuda_graph) {
+    if (graph && !use_cuda_graph) {
         graph->use_cpy_indirection = false;
     }
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 7b7e6b26..a0f30330 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -796,7 +796,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -813,7 +813,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
@@ -859,7 +859,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -874,7 +874,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
diff --git a/src/llama.cpp b/src/llama.cpp
index c6b87eb0..becc0626 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4627,7 +4627,7 @@ struct llama_context * llama_new_context_with_model(
         // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
         auto params = cparams.cuda_params;
         std::string new_params;
-        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        if (false && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
             static const std::string extra_string{"graphs=0"};
             if (params) new_params = std::string{(const char *)params} + ',';
             new_params += extra_string;
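
Note on the synchronization change (and the added "Why are we protecting an atomic_int with a mutex?" comment): since every access to the counter in this patch already happens while ggml_cuda_lock is held, a plain int guarded by the mutex is sufficient and the atomic was redundant. Below is a minimal self-contained sketch of that pattern for reference; the capture_* names are illustrative placeholders, not symbols from the patch, which uses ggml_cuda_lock, ggml_cuda_lock_cv and ggml_cuda_lock_counter.

#include <condition_variable>
#include <mutex>

static std::mutex              capture_mutex;
static std::condition_variable capture_cv;
static int                     capture_count = 0; // plain int: only touched while capture_mutex is held

// Called when a stream begins CUDA graph capture.
static void capture_begin() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    ++capture_count;
}

// Called once capture has finished; wakes anyone waiting for quiescence.
static void capture_end() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    if (--capture_count == 0) {
        capture_cv.notify_all();
    }
}

// Called from a destructor or teardown path: block until no capture is in flight.
static void wait_for_no_capture() {
    std::unique_lock<std::mutex> lock(capture_mutex);
    capture_cv.wait(lock, [] { return capture_count == 0; });
}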