Revert 2

2026-04-28 10:21:48 +00:00 · 2026-01-17 08:49:23 -06:00
parent 93abfb66ab
commit 2403d6ee02
15 changed files with 23 additions and 23 deletions
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {

                const auto t_pp_start = ggml_time_us();

-                llama_memory_clear(ctx);
+                llama_kv_cache_clear(ctx);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -338,7 +338,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    const struct llama_model * model = llama_get_model(ctx);

    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);

    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -43,7 +43,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        }

        // clear previous kv_cache values (irrelevant for embeddings)
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);
        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);

@@ -98,7 +98,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    const llama_model * mdl = llama_get_model(ctx);
    llama_token eos_token = llama_token_eos(mdl);

-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -703,7 +703,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -2136,7 +2136,7 @@ int main(int argc, char ** argv) {

        test t(inst, lmodel, ctx);

-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        // warmup run
        if (params.warmup) {
@@ -2150,7 +2150,7 @@ int main(int argc, char ** argv) {
        }

        for (int i = 0; i < params.reps; i++) {
-            llama_memory_clear(ctx);
+            llama_kv_cache_clear(ctx);

            uint64_t t_start = get_time_ns();

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -407,7 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -582,7 +582,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -951,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            return;
        }

-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1228,7 +1228,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            return;
        }

-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1601,7 +1601,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            return;
        }

-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1787,7 +1787,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        }

        // clear the KV cache
-        llama_memory_clear(ctx);
+        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -82,7 +82,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);

    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);

        // erase whole kv
-        llama_memory_clear(ctx3);
+        llama_kv_cache_clear(ctx3);
        fprintf(stderr, "%s : kv cache cleared\n", __func__);

        // restore kv into seq 1
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1138,7 +1138,7 @@ void server_context::kv_cache_clear() {
    LOG_VERBOSE("clearing KV cache", {});

    // clear the entire KV cache
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);
    clean_kv_cache = false;
 }

--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
    }

    common_batch_clear(batch);
-    llama_memory_clear(ctx);
+    llama_kv_cache_clear(ctx);

    for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
        // clean up KV cache before generation