Server: rename functions and refactor code

rename functions

refactor update slots

rename params_base

rename timings
firecoperana
2026-01-13 12:02:58 -06:00
parent cb1063f6cd
commit b43b22b68a
39 changed files with 609 additions and 595 deletions
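For orientation before reading the hunks: the renames follow a single pattern — old llama_sampling_*, llama_token_to_piece, and llama_kv_cache_seq_* calls become common_sampler_*, common_token_to_piece, and llama_memory_seq_* with the same arguments. Below is a minimal sketch of the new call pattern using only the signatures visible in the hunks; the header names, the llama_sampling_params type, and the elided llama_decode() feedback step are assumptions, not taken from this commit.

// Sketch only: call names and argument lists are copied from the hunks below;
// the include names, the llama_sampling_params type, and the omitted
// llama_decode() step are assumptions.
#include "common.h"
#include "sampling.h"
#include <cstdio>

static void sample_loop_sketch(llama_model * model, llama_context * ctx,
                               llama_sampling_params & sparams, int n_predict) {
    struct llama_sampling_context * ctx_sampling =
        common_sampler_init(llama_get_model_vocab(model), sparams);               // was llama_sampling_init

    for (int i = 0; i < n_predict; ++i) {
        const llama_token id = common_sampler_sample(ctx_sampling, ctx, nullptr); // was llama_sampling_sample
        common_sampler_accept(ctx_sampling, ctx, id, true);                       // was llama_sampling_accept
        printf("%s", common_token_to_piece(ctx, id).c_str());                     // was llama_token_to_piece
        // ... feed id back through llama_decode() here, as the example main loop does ...
    }

    common_sampler_free(ctx_sampling);                                            // was llama_sampling_free
}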


@@ -264,13 +264,13 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
}
@@ -349,7 +349,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams);
+struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams);
while (n_remain != 0 || params.interactive) {
// predict
@@ -385,8 +385,8 @@ int main(int argc, char ** argv) {
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
-llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
-llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+llama_memory_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+llama_memory_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;
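The two renamed calls in this hunk are the usual context shift applied when the cache fills. A worked sketch, assuming llama_memory_seq_rm / llama_memory_seq_add keep the old llama_kv_cache_seq_rm / llama_kv_cache_seq_add semantics (remove the positions in [p0, p1), then shift a position range by a delta):

// Hypothetical helper mirroring the hunk above; the range semantics are an
// assumption carried over from the old llama_kv_cache_seq_* calls.
static void context_shift_sketch(llama_context * ctx, int & n_past, int n_keep, int n_discard) {
    // drop n_discard tokens right after the kept prefix (+1 skips the BOS token)
    llama_memory_seq_rm (ctx, 0, n_keep + 1, n_keep + n_discard + 1);
    // slide the remaining tokens back by n_discard positions
    llama_memory_seq_add(ctx, 0, n_keep + 1 + n_discard, n_past, -n_discard);
    n_past -= n_discard;
    // e.g. n_keep = 4, n_discard = 8, n_past = 512:
    //   positions [5, 13) are removed, [13, 512) slides to [5, 504), n_past becomes 504
}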
@@ -421,9 +421,9 @@ int main(int argc, char ** argv) {
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+const llama_token id = common_sampler_sample(ctx_sampling, ctx, nullptr);
-llama_sampling_accept(ctx_sampling, ctx, id, true);
+common_sampler_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
-llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
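The accept call above is the prompt-side variant of the one in the previous hunk. A two-line sketch of the difference, assuming the final argument keeps its old apply-grammar meaning:

// Prompt tokens: recorded only for repetition penalties, grammar untouched (assumed meaning of `false`).
common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
// Sampled tokens: recorded and the grammar state is advanced (assumed meaning of `true`).
common_sampler_accept(ctx_sampling, ctx, id, true);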
@@ -456,7 +456,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
-const std::string token_str = llama_token_to_piece(ctx, id);
+const std::string token_str = common_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
if (embd.size() > 1) {
@@ -479,7 +479,7 @@ int main(int argc, char ** argv) {
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
@@ -601,7 +601,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
-output_ss << llama_token_to_piece(ctx, token);
+output_ss << common_token_to_piece(ctx, token);
}
n_remain -= line_inp.size();
@@ -615,7 +615,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
-llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
+common_sampler_reset(llama_get_model_vocab(model), ctx_sampling);
}
is_interacting = false;
}
@@ -634,7 +634,7 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
printf("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
}
@@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
-llama_sampling_free(ctx_sampling);
+common_sampler_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS