Avoid rebuild of GGML graph for each token (#98)

Introduces caching of the GGML graph to avoid an unnecessary full rebuild between tokens.
KV cache parameters, which change with each token, are updated directly in the cached GGML
graph. Caching can be disabled with the GGML_DISABLE_GRAPH_CACHING environment variable.
This commit is contained in:
agray3
2024-10-20 07:36:16 +01:00
committed by GitHub
parent b091a3513e
commit d336410509
4 changed files with 161 additions and 13 deletions

View File

@@ -232,6 +232,12 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// Utility to query whether cached GGML graph is in use
GGML_API bool ggml_use_cached_graph(ggml_backend_sched_t sched);
// Set whether or not to use GGML graph caching
GGML_API void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value);
#ifdef __cplusplus
}

View File

@@ -597,6 +597,13 @@ extern "C" {
GGML_TENSOR_FLAG_PARAM = 4,
};
// Flag (used on GGML_OP_CPY nodes) on whether node is associated with K or V cache.
// Lets the cached-graph path identify which KV-cache copy a node performs so its
// parameters can be updated in place between tokens instead of rebuilding the graph.
enum ggml_kv_cache_flag {
    GGML_KV_CACHE_FLAG_NONE = 0, // node is not a KV-cache copy
    GGML_KV_CACHE_FLAG_K    = 1, // node copies into the K (key) cache
    GGML_KV_CACHE_FLAG_V    = 2  // node copies into the V (value) cache
};
// ggml object
struct ggml_object {
size_t offs;