diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 298d3214..09c93606 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -498,7 +498,8 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
 static std::mutex ggml_cuda_lock;
 static std::condition_variable ggml_cuda_lock_cv;
-static std::atomic<int> ggml_cuda_lock_counter;
+//static std::atomic<int> ggml_cuda_lock_counter;
+static int ggml_cuda_lock_counter = 0;
 
 ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
     device(device),
     name(GGML_CUDA_NAME + std::to_string(device)) {
@@ -511,8 +512,10 @@ ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
 ggml_backend_cuda_context::~ggml_backend_cuda_context() {
+    printf("%s: have %d graphs\n", __func__, int(cuda_graphs.size()));
+
     std::unique_lock<std::mutex> lock(ggml_cuda_lock);
-    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter == 0; });
 
     auto info = const_cast<ggml_cuda_device_info *>(&ggml_cuda_info());
     info->all_ctx[this->device] = nullptr;
@@ -520,6 +523,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
     if (copy_event != nullptr) {
         CUDA_CHECK(cudaEventDestroy(copy_event));
     }
+    if (compute_event != nullptr) {
+        CUDA_CHECK(cudaEventDestroy(compute_event));
+    }
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
         for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
             if (streams[i][j] != nullptr) {
@@ -3769,6 +3775,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_cuda_graph
         if (ggml_is_noop(node)) continue;
 
+        if (node->op == GGML_OP_REDUCE) {
+            use_cuda_graph = false;
+            break;
+        }
+
         if (node->op == GGML_OP_MUL_MAT_ID && (node->ne[2] != 1 || node->src[2]->ne[0] != 1)) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
@@ -3988,7 +3999,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             graph_evaluated_or_captured = true; // CUDA graph has been captured
 
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+            if (--ggml_cuda_lock_counter == 0) {
                 ggml_cuda_lock_cv.notify_all();
             }
         } else {
@@ -4026,7 +4037,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // Also disable for multi-gpu for now. TO DO investigate
     bool use_cuda_graph = !disable_cuda_graphs_due_to_env && cuda_ctx->use_cuda_graph;
 
-    auto graph = ggml_cuda_get_graph(*cuda_ctx, ggml_cuda_graph_get_key(cgraph));
+    ggml_cuda_graph * graph = nullptr;
+    if (use_cuda_graph) {
+        auto graph_key = ggml_cuda_graph_get_key(cgraph);
+        graph = ggml_cuda_get_graph(*cuda_ctx, graph_key);
+    }
     cuda_ctx->cur_graph = graph;
 
     bool cuda_graph_update_required = false;
@@ -4053,14 +4068,18 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(graph, cgraph, use_cuda_graph, cuda_ctx->stream());
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            graph->number_consecutive_updates++;
-        } else {
-            graph->number_consecutive_updates = 0;
+        if (use_cuda_graph) {
+            if (cuda_graph_update_required) {
+                graph->number_consecutive_updates++;
+            } else {
+                graph->number_consecutive_updates = 0;
+            }
         }
 
         if (graph->number_consecutive_updates >= 4) {
             graph->disable_due_to_too_many_updates = true;
+            use_cuda_graph = false;
+            cuda_ctx->cur_graph = nullptr;
 #ifndef NDEBUG
             GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
@@ -4069,15 +4088,16 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
+        // Why are we protecting an atomic_int with a mutex?
         {
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+            ++ggml_cuda_lock_counter;
         }
 
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
-    if (!use_cuda_graph) {
+    if (graph && !use_cuda_graph) {
         graph->use_cpy_indirection = false;
     }
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 7b7e6b26..a0f30330 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -796,7 +796,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -813,7 +813,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
@@ -859,7 +859,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -874,7 +874,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
diff --git a/src/llama.cpp b/src/llama.cpp
index c6b87eb0..becc0626 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4627,7 +4627,7 @@ struct llama_context * llama_new_context_with_model(
         // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
         auto params = cparams.cuda_params;
         std::string new_params;
-        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        if (false && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
             static const std::string extra_string{"graphs=0"};
             if (params) new_params = std::string{(const char *)params} + ',';
             new_params += extra_string;
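
Note on the synchronization change (and the added "Why are we protecting an atomic_int with a mutex?" comment): since every access to the counter in this patch already happens while ggml_cuda_lock is held, a plain int guarded by the mutex is sufficient and the atomic was redundant. Below is a minimal self-contained sketch of that pattern for reference; the capture_* names are illustrative placeholders, not symbols from the patch, which uses ggml_cuda_lock, ggml_cuda_lock_cv and ggml_cuda_lock_counter.

#include <condition_variable>
#include <mutex>

static std::mutex              capture_mutex;
static std::condition_variable capture_cv;
static int                     capture_count = 0; // plain int: only touched while capture_mutex is held

// Called when a stream begins CUDA graph capture.
static void capture_begin() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    ++capture_count;
}

// Called once capture has finished; wakes anyone waiting for quiescence.
static void capture_end() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    if (--capture_count == 0) {
        capture_cv.notify_all();
    }
}

// Called from a destructor or teardown path: block until no capture is in flight.
static void wait_for_no_capture() {
    std::unique_lock<std::mutex> lock(capture_mutex);
    capture_cv.wait(lock, [] { return capture_count == 0; });
}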