Split mode graph: use CUDA graphs (#1177)

* Use CUDA graphs also when there are tensor overrides

* Change graph key

* This seems to work
Author: Kawrakow
Date: 2026-01-22 12:38:36 +02:00 (committed by GitHub)
Parent: 573e23679d
Commit: 851fda3509
3 changed files with 35 additions and 15 deletions
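
The commit message is terse, so here is an illustrative sketch of the idea the diff points at: CUDA graphs are cached per compute graph and looked up through a key derived from the cgraph (via ggml_cuda_graph_get_key() / ggml_cuda_get_graph() in the hunks below). With split mode graph, each GPU presumably evaluates its own sub-graph, so a single cached graph per context is not enough; a keyed map lets every sub-graph keep its own captured instance. Everything in this sketch (types, key derivation, container layout) is an assumption for illustration, not the actual ik_llama.cpp implementation.

// Illustrative sketch only: a per-graph cache keyed by a value derived from
// the cgraph. The real ggml_cuda_graph_get_key()/ggml_cuda_get_graph() are
// not part of this diff; the key derivation and types below are assumptions.
#include <cstdint>
#include <memory>
#include <unordered_map>

struct cuda_graph_state {                    // stand-in for ggml_cuda_graph
    int  number_consecutive_updates      = 0;
    bool disable_due_to_too_many_updates = false;
    bool use_cpy_indirection             = false;
};

struct cgraph_stub {                         // stand-in for ggml_cgraph
    int         n_nodes;
    const void* first_node;                  // e.g. nodes[0], if any
};

// One plausible key: mix the node count with the address of the first node,
// so different sub-graphs of a split-mode model map to different entries.
static uint64_t graph_key(const cgraph_stub & g) {
    auto addr = reinterpret_cast<std::uintptr_t>(g.first_node);
    return uint64_t(g.n_nodes) * 0x9E3779B97F4A7C15ull ^ uint64_t(addr);
}

struct cuda_context_stub {                   // stand-in for ggml_backend_cuda_context
    std::unordered_map<uint64_t, std::unique_ptr<cuda_graph_state>> cuda_graphs;

    // Lazily create the per-key graph state, mirroring the lookup the diff
    // performs only when use_cuda_graph is true.
    cuda_graph_state * get_graph(uint64_t key) {
        auto & slot = cuda_graphs[key];
        if (!slot) slot = std::make_unique<cuda_graph_state>();
        return slot.get();
    }
};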


@@ -498,7 +498,8 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
 static std::mutex ggml_cuda_lock;
 static std::condition_variable ggml_cuda_lock_cv;
-static std::atomic<int> ggml_cuda_lock_counter;
+//static std::atomic<int> ggml_cuda_lock_counter;
+static int ggml_cuda_lock_counter = 0;
 ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
     device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
@@ -511,8 +512,10 @@ ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
 ggml_backend_cuda_context::~ggml_backend_cuda_context() {
+    printf("%s: have %d graphs\n", __func__, int(cuda_graphs.size()));
     std::unique_lock<std::mutex> lock(ggml_cuda_lock);
-    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter == 0; });
     auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
     info->all_ctx[this->device] = nullptr;
@@ -520,6 +523,9 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
     if (copy_event != nullptr) {
         CUDA_CHECK(cudaEventDestroy(copy_event));
     }
+    if (compute_event != nullptr) {
+        CUDA_CHECK(cudaEventDestroy(compute_event));
+    }
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
         for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
             if (streams[i][j] != nullptr) {
@@ -3769,6 +3775,11 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_cuda_graph
         if (ggml_is_noop(node)) continue;
+        if (node->op == GGML_OP_REDUCE) {
+            use_cuda_graph = false;
+            break;
+        }
         if (node->op == GGML_OP_MUL_MAT_ID && (node->ne[2] != 1 || node->src[2]->ne[0] != 1)) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
@@ -3988,7 +3999,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         graph_evaluated_or_captured = true; // CUDA graph has been captured
         std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-        if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+        if (--ggml_cuda_lock_counter == 0) {
             ggml_cuda_lock_cv.notify_all();
         }
     } else {
@@ -4026,7 +4037,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     // Also disable for multi-gpu for now. TO DO investigate
     bool use_cuda_graph = !disable_cuda_graphs_due_to_env && cuda_ctx->use_cuda_graph;
-    auto graph = ggml_cuda_get_graph(*cuda_ctx, ggml_cuda_graph_get_key(cgraph));
+    ggml_cuda_graph * graph = nullptr;
+    if (use_cuda_graph) {
+        auto graph_key = ggml_cuda_graph_get_key(cgraph);
+        graph = ggml_cuda_get_graph(*cuda_ctx, graph_key);
+    }
     cuda_ctx->cur_graph = graph;
     bool cuda_graph_update_required = false;
bool cuda_graph_update_required = false;
@@ -4053,14 +4068,18 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(graph, cgraph, use_cuda_graph, cuda_ctx->stream());
     // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-    if (use_cuda_graph && cuda_graph_update_required) {
-        graph->number_consecutive_updates++;
-    } else {
-        graph->number_consecutive_updates = 0;
+    if (use_cuda_graph) {
+        if (cuda_graph_update_required) {
+            graph->number_consecutive_updates++;
+        } else {
+            graph->number_consecutive_updates = 0;
+        }
     }
     if (graph->number_consecutive_updates >= 4) {
         graph->disable_due_to_too_many_updates = true;
         use_cuda_graph = false;
+        cuda_ctx->cur_graph = nullptr;
 #ifndef NDEBUG
         GGML_CUDA_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
@@ -4069,15 +4088,16 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     if (use_cuda_graph && cuda_graph_update_required) {
         // Start CUDA graph capture
+        // Why are we protecting an atomic_int with a mutex?
         {
             std::lock_guard<std::mutex> lock(ggml_cuda_lock);
-            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+            ++ggml_cuda_lock_counter;
         }
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
-    if (!use_cuda_graph) {
+    if (graph && !use_cuda_graph) {
         graph->use_cpy_indirection = false;
     }
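
The hunks above also replace the std::atomic<int> capture counter with a plain int. Every access shown in the diff already happens under ggml_cuda_lock, so the atomic bought nothing (which is presumably what the added comment is poking at); a plain counter with notify-on-zero is equivalent and simpler. Below is a standalone sketch of the resulting wait/notify pattern; it is a simplified illustration, not the ggml-cuda code itself.

// Sketch of the capture-counter pattern after this change: a plain int
// guarded by a mutex, with a condition variable so that a destructor-style
// waiter can block until every in-flight capture has finished.
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

static std::mutex              capture_mutex;        // plays the role of ggml_cuda_lock
static std::condition_variable capture_cv;           // plays the role of ggml_cuda_lock_cv
static int                     capture_counter = 0;  // was std::atomic<int>

static void begin_capture() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    ++capture_counter;                                // was fetch_add(1, relaxed)
}

static void end_capture() {
    std::lock_guard<std::mutex> lock(capture_mutex);
    if (--capture_counter == 0) {                     // was fetch_sub(1, relaxed) == 1
        capture_cv.notify_all();
    }
}

static void wait_for_captures() {                     // what the context destructor does
    std::unique_lock<std::mutex> lock(capture_mutex);
    capture_cv.wait(lock, []{ return capture_counter == 0; });
}

int main() {
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([]{ begin_capture(); end_capture(); });
    }
    for (auto & t : workers) t.join();
    wait_for_captures();   // returns immediately once the counter is back at 0
    return 0;
}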


@@ -796,7 +796,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -813,7 +813,7 @@ bool ggml_cuda_cpy_2(ggml_backend_cuda_context & ctx, const ggml_tensor * src1,
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
@@ -859,7 +859,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     char ** dest_ptrs = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         dest_ptrs = ctx.cur_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cur_graph->graph_cpynode_index;
     }
@@ -874,7 +874,7 @@ bool ggml_cuda_concat_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * c
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
+    if(ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
         ctx.cur_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #endif
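
Because the graph is now looked up only when use_cuda_graph is true, ctx.cur_graph can be nullptr on the non-graph path; the added `ctx.cur_graph &&` terms above are the corresponding guards. A minimal sketch of that guard with simplified stand-in types (not the real ggml-cuda structures):

// Minimal sketch: cur_graph may be nullptr whenever CUDA graphs are disabled
// for the current compute call, so the copy-indirection state must only be
// reached through a guarded pointer.
struct graph_state_stub {
    bool    use_cpy_indirection = false;
    char ** dest_ptrs_d         = nullptr;
    int     graph_cpynode_index = 0;
};

struct context_stub {
    graph_state_stub * cur_graph = nullptr;   // nullptr when graphs are off
};

static char ** copy_dest_ptrs(context_stub & ctx, bool disable_indirection) {
    // The added `ctx.cur_graph &&` term keeps the non-graph path from
    // dereferencing a null pointer.
    if (ctx.cur_graph && ctx.cur_graph->use_cpy_indirection && !disable_indirection) {
        return ctx.cur_graph->dest_ptrs_d;
    }
    return nullptr;
}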


@@ -4627,7 +4627,7 @@ struct llama_context * llama_new_context_with_model(
         // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
         auto params = cparams.cuda_params;
         std::string new_params;
-        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        if (false && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
             static const std::string extra_string{"graphs=0"};
             if (params) new_params = std::string{(const char *)params} + ',';
             new_params += extra_string;