Revert 2: rename llama_memory_clear to llama_kv_cache_clear

2026-05-01 03:41:53 +00:00 · 2026-01-17 08:49:23 -06:00
parent 93abfb66ab
commit 2403d6ee02
15 changed files with 23 additions and 23 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3054,7 +3054,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        }
-        llama_memory_clear(lctx);
+        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
        llama_reset_timings(lctx);
    }
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -239,7 +239,7 @@ std::vector<llama_token> llama_speculative_gen_draft(
    result.reserve(params.n_draft);
    if (reuse_n == 0) {
-        llama_memory_clear(ctx_dft);
+        llama_kv_cache_clear(ctx_dft);
        prompt_dft.clear();
    } else {
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
                const auto t_pp_start = ggml_time_us();
-                llama_memory_clear(ctx);
+                llama_kv_cache_clear(ctx);
                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -338,7 +338,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    const struct llama_model * model = llama_get_model(ctx);
    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -43,7 +43,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        }
        // clear previous kv_cache values (irrelevant for embeddings)
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);
@@ -98,7 +98,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    const llama_model * mdl = llama_get_model(ctx);
    llama_token eos_token = llama_token_eos(mdl);
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -703,7 +703,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -2136,7 +2136,7 @@ int main(int argc, char ** argv) {
        test t(inst, lmodel, ctx);
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        // warmup run
        if (params.warmup) {
@@ -2150,7 +2150,7 @@ int main(int argc, char ** argv) {
        }
        for (int i = 0; i < params.reps; i++) {
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);
            uint64_t t_start = get_time_ns();
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -407,7 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -582,7 +582,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -951,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            return;
        }
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1228,7 +1228,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            return;
        }
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1601,7 +1601,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            return;
        }
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1787,7 +1787,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        }
        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -82,7 +82,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
        // erase whole kv
-        llama_memory_clear(ctx3);
+        llama_kv_cache_clear(ctx3);
        fprintf(stderr, "%s : kv cache cleared\n", __func__);
        // restore kv into seq 1
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1138,7 +1138,7 @@ void server_context::kv_cache_clear() {
    LOG_VERBOSE("clearing KV cache", {});
    // clear the entire KV cache
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    clean_kv_cache = false;
 }
--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
    }
    common_batch_clear(batch);
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
        // clean up KV cache before generation
--- a/include/llama.h
+++ b/include/llama.h
@@ -763,7 +763,7 @@ extern "C" {
    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_memory_clear(
+    LLAMA_API void llama_kv_cache_clear(
            struct llama_context * ctx);
    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5385,7 +5385,7 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    return ctx->kv_self.used;
 }
-void llama_memory_clear(struct llama_context * ctx) {
+void llama_kv_cache_clear(struct llama_context * ctx) {
    llama_kv_cache_clear(ctx->kv_self);
 }
@@ -6039,7 +6039,7 @@ struct llama_data_read {
        if (!res) {
            if (seq_id == -1) {
-                llama_memory_clear(ctx);
+                llama_kv_cache_clear(ctx);
            } else {
                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
            }