Server: rename functions and refactor code

rename functions; refactor update slots; rename params_base; rename timings
2026-04-29 19:01:47 +00:00 · 2026-01-13 12:02:58 -06:00
parent cb1063f6cd
commit b43b22b68a
39 changed files with 609 additions and 595 deletions
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -14,11 +14,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

    for (uint64_t i = 0; i < sentences.size(); i++) {
-        llama_batch_clear(batch);
+        common_batch_clear(batch);

        const std::string input_string = instruction + sentences[i];

-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        std::vector<llama_token> inputs = common_tokenize(mdl, input_string, true, false);

        const int32_t n_toks = inputs.size();

@@ -27,7 +27,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        // inputs.push_back(llama_token_eos(mdl));

        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+        const int32_t n_inst = common_tokenize(mdl, instruction, true, false).size();

 #ifdef GRIT_DEBUG
        // debug tokens - should be matching as referenced in the GritLM sample
@@ -39,11 +39,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        // add input to batch (this increments n_tokens)
        for (int32_t j = 0; j < n_toks; j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
        }

        // clear previous kv_cache values (irrelevant for embeddings)
-        llama_kv_cache_clear(ctx);
+        llama_memory_clear(ctx);
        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);

@@ -98,20 +98,20 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    const llama_model * mdl = llama_get_model(ctx);
    llama_token eos_token = llama_token_eos(mdl);

-    llama_kv_cache_clear(ctx);
+    llama_memory_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);

    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    std::vector<llama_token> inputs = common_tokenize(mdl, prompt, false, true);
    int32_t i_current_token = 0;

    while (true) {
-        llama_batch_clear(bat);
+        common_batch_clear(bat);
        auto n_inputs = (int32_t)inputs.size();
        for (int32_t i = 0; i < n_inputs; i++) {
-            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
        }
        inputs.clear();

@@ -130,7 +130,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
            break;
        }

-        std::string piece = llama_token_to_piece(ctx, token);
+        std::string piece = common_token_to_piece(ctx, token);
        if (stream) {
            std::printf("%s", piece.c_str());
            std::fflush(stdout);