diff --git a/common/common.cpp b/common/common.cpp
index 6c1d4616..3673e654 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3054,7 +3054,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         }
-        llama_memory_clear(lctx);
+        llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
diff --git a/common/speculative.cpp b/common/speculative.cpp
index b450e2a2..ff0e167f 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -239,7 +239,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_memory_clear(ctx_dft);
+        llama_kv_cache_clear(ctx_dft);
 
         prompt_dft.clear();
     } else {
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 676fda41..b33845bf 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
 
         const auto t_pp_start = ggml_time_us();
 
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
             LOG_TEE("%s: llama_decode() failed\n", __func__);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 06b3f4c0..19553809 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -338,7 +338,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 7a7b24ed..1a97d234 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const struct llama_model * model = llama_get_model(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     // run model
     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index ce32bdc2..f51792de 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -43,7 +43,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
@@ -98,7 +98,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     const llama_model * mdl = llama_get_model(ctx);
     llama_token eos_token = llama_token_eos(mdl);
 
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index da123172..2e03a4a0 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -703,7 +703,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index fe239e24..e3d7eec9 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -2136,7 +2136,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         // warmup run
         if (params.warmup) {
@@ -2150,7 +2150,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 10c8a49f..5c5d2397 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -407,7 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -582,7 +582,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -951,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1228,7 +1228,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1601,7 +1601,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1787,7 +1787,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         }
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 59a1f13e..eb0b1ecc 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -82,7 +82,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     // run model
     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 46786d94..73537a5b 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
 
     // erase whole kv
-    llama_memory_clear(ctx3);
+    llama_kv_cache_clear(ctx3);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1
diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index a3b781c8..3c4ff874 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1138,7 +1138,7 @@ void server_context::kv_cache_clear() {
     LOG_VERBOSE("clearing KV cache", {});
 
     // clear the entire KV cache
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     clean_kv_cache = false;
 }
 
diff --git a/examples/sweep-bench/sweep-bench.cpp b/examples/sweep-bench/sweep-bench.cpp
index 60b2f265..449a0b66 100644
--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
     }
 
     common_batch_clear(batch);
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
         // clean up KV cache before generation
diff --git a/include/llama.h b/include/llama.h
index 801f097f..757473f3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -763,7 +763,7 @@ extern "C" {
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_memory_clear(
+    LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
diff --git a/src/llama.cpp b/src/llama.cpp
index f042a751..fd254752 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5385,7 +5385,7 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
     return ctx->kv_self.used;
 }
 
-void llama_memory_clear(struct llama_context * ctx) {
+void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
 }
 
@@ -6039,7 +6039,7 @@ struct llama_data_read {
 
         if (!res) {
             if (seq_id == -1) {
-                llama_memory_clear(ctx);
+                llama_kv_cache_clear(ctx);
             } else {
                 llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
             }
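Usage note (not part of the patch): a minimal sketch of what a caller looks like after this rename. The setup flow and model path below are illustrative assumptions; only the llama_kv_cache_clear() call itself reflects the renamed API declared in include/llama.h above.

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init();

        // illustrative setup -- any GGUF model path works here
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // decode a single BOS token so the KV cache holds some data
        std::vector<llama_token> tmp = { llama_token_bos(model) };
        llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));

        // renamed API: erases all cell info and zeroes the KV data
        llama_kv_cache_clear(ctx);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }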