diff --git a/common/common.cpp b/common/common.cpp
index 6c1d4616..3673e654 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3054,7 +3054,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         }
-        llama_memory_clear(lctx);
+        llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }
diff --git a/common/speculative.cpp b/common/speculative.cpp
index b450e2a2..ff0e167f 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -239,7 +239,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_memory_clear(ctx_dft);
+        llama_kv_cache_clear(ctx_dft);
 
         prompt_dft.clear();
     } else {
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 676fda41..b33845bf 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
 
         const auto t_pp_start = ggml_time_us();
 
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
             LOG_TEE("%s: llama_decode() failed\n", __func__);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 06b3f4c0..19553809 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -338,7 +338,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 7a7b24ed..1a97d234 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const struct llama_model * model = llama_get_model(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     // run model
     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index ce32bdc2..f51792de 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -43,7 +43,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
@@ -98,7 +98,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     const llama_model * mdl = llama_get_model(ctx);
     llama_token eos_token = llama_token_eos(mdl);
 
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index da123172..2e03a4a0 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -703,7 +703,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index fe239e24..e3d7eec9 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -2136,7 +2136,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         // warmup run
         if (params.warmup) {
@@ -2150,7 +2150,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 10c8a49f..5c5d2397 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -407,7 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -582,7 +582,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         const auto t_start = std::chrono::high_resolution_clock::now();
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -951,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1228,7 +1228,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1601,7 +1601,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 return;
             }
 
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
 
             // decode all tasks [i0, i1)
             if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1787,7 +1787,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         }
 
         // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 59a1f13e..eb0b1ecc 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -82,7 +82,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     // run model
     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 46786d94..73537a5b 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
 
     // erase whole kv
-    llama_memory_clear(ctx3);
+    llama_kv_cache_clear(ctx3);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1
diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index a3b781c8..3c4ff874 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1138,7 +1138,7 @@ void server_context::kv_cache_clear() {
     LOG_VERBOSE("clearing KV cache", {});
 
     // clear the entire KV cache
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
     clean_kv_cache = false;
 }
 
diff --git a/examples/sweep-bench/sweep-bench.cpp b/examples/sweep-bench/sweep-bench.cpp
index 60b2f265..449a0b66 100644
--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
     }
 
     common_batch_clear(batch);
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
 
     for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
         // clean up KV cache before generation
diff --git a/include/llama.h b/include/llama.h
index 801f097f..757473f3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -763,7 +763,7 @@ extern "C" {
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_memory_clear(
+    LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
diff --git a/src/llama.cpp b/src/llama.cpp
index f042a751..fd254752 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5385,7 +5385,7 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
     return ctx->kv_self.used;
 }
 
-void llama_memory_clear(struct llama_context * ctx) {
+void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
 }
 
@@ -6039,7 +6039,7 @@ struct llama_data_read {
 
         if (!res) {
             if (seq_id == -1) {
-                llama_memory_clear(ctx);
+                llama_kv_cache_clear(ctx);
             } else {
                 llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
             }
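Usage note (not part of the patch): a minimal sketch of what a caller looks like after this rename. The setup flow and model path below are illustrative assumptions; only the llama_kv_cache_clear() call itself reflects the renamed API declared in include/llama.h above.

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init();

        // illustrative setup -- any GGUF model path works here
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // decode a single BOS token so the KV cache holds some data
        std::vector<llama_token> tmp = { llama_token_bos(model) };
        llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));

        // renamed API: erases all cell info and zeroes the KV data
        llama_kv_cache_clear(ctx);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }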