From dc28cadb65a736cab712a4dece2d5e74880385d1 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 22 Dec 2025 18:43:13 +0000 Subject: [PATCH] Simple async --- CMakeLists.txt | 2 +- ggml/src/ggml-backend.cpp | 33 +++++++++++++++++++++++++++++++-- tests/CMakeLists.txt | 2 +- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbb2c991..acf17f9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ include(CheckIncludeFileCXX) set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 65739cf3..969dbd19 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #define IK_PRINT_TIMING 0 @@ -2088,6 +2089,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::array needs_sync{{true}}; std::array own_cpy{{false}}; + std::barrier barrier(sched->n_backends, [] () {}); + std::vector workers; + workers.reserve(sched->n_backends); + std::vector statuses(sched->n_backends, GGML_STATUS_SUCCESS); + if (sched->split_mode_graph) { auto tensor_size = [] (const ggml_tensor * t) { auto nbytes = ggml_nbytes(t); @@ -2168,6 +2174,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + auto compute = [sched, &needs_sync, &own_cpy, &barrier, &statuses] (int ith) { + struct ggml_backend_sched_split * splits = sched->splits; std::vector ids; @@ -2182,6 +2190,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + bool needs_barrier = split->n_inputs > 0 || split->graph.nodes[0]->op == GGML_OP_REDUCE; + + if (needs_barrier) { + barrier.arrive_and_wait(); + } + + if (ith == split_backend_id) { // copy the input tensors to the split backend ggml_backend_sched_copy_inputs(sched, split, needs_sync, ids, unique_ids, last_ids_tensor); @@ -2201,7 +2216,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s #endif enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { - return ec; + statuses[ith] = ec; + return; } } else { // similar to ggml_backend_compare_graph_backend @@ -2228,7 +2244,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { - return ec; + statuses[ith] = ec; + return; } // TODO: pass backend to the callback, then the user can decide if they want to synchronize @@ -2241,6 +2258,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s j0 = j1; } } + } + + if (needs_barrier) { + barrier.arrive_and_wait(); + } // record the event of this copy if (split->n_inputs > 0) { @@ -2249,6 +2271,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } } + }; + + for (int i = 0; i < sched->n_backends; ++i) workers.emplace_back(compute, i); + for (auto & w : workers) w.join(); + for (auto status : statuses) { + if (status != GGML_STATUS_SUCCESS) return status; + } sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 89d0ec26..d0334217 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -129,7 +129,7 @@ if (NOT WIN32) endif() llama_target_and_test(test-chat-parser.cpp) -llama_target_and_test(test-chat-template.cpp) +#llama_target_and_test(test-chat-template.cpp) llama_target_and_test(test-json-partial.cpp) llama_target_and_test(test-regex-partial.cpp)