Server: refactor and rename functions (#1151)

* Server: rename functions and refactor code

rename functions

refactor update slots

rename params_base

rename timings

* change

* Revert kv cache name changes

* Revert 2

* fix test build error

---------

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Date: 2026-01-18 00:16:57 -06:00
Committed by: GitHub
Parent: 7024fdbc72
Commit: d71a3ec315
38 changed files with 532 additions and 528 deletions
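
Every hunk below applies the same mechanical mapping from the old llama_* names to the new common_* names. As a reading aid, here is that mapping sketched as forwarding shims; the shims are hypothetical and not part of this commit, but the common_* signatures match the call sites visible in the diff:

#include "common.h" // assumed home of the renamed common_* helpers
#include "llama.h"
#include <string>

// Hypothetical forwarding shims, illustration only (not in the commit):
static inline std::string old_llama_token_to_piece(llama_context * ctx, llama_token id) {
    return common_token_to_piece(ctx, id); // was: llama_token_to_piece
}
static inline llama_token old_llama_sampling_sample(llama_sampling_context * s, llama_context * ctx, llama_context * guidance) {
    return common_sampler_sample(s, ctx, guidance); // was: llama_sampling_sample
}
static inline void old_llama_sampling_accept(llama_sampling_context * s, llama_context * ctx, llama_token id, bool apply_grammar) {
    common_sampler_accept(s, ctx, id, apply_grammar); // was: llama_sampling_accept
}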

@@ -402,7 +402,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
@@ -410,14 +410,14 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], common_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > add_bos) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_TEE("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
}
@@ -449,7 +449,7 @@ int main(int argc, char ** argv) {
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
@@ -464,7 +464,7 @@ int main(int argc, char ** argv) {
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
@@ -474,7 +474,7 @@ int main(int argc, char ** argv) {
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_TEE("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
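
The four hunks above make the same substitution inside the verbose-prompt dumps. The renamed helper also composes into a simple detokenizer; a minimal hypothetical sketch (no such helper exists in this file), assuming <string>, <vector> and the headers noted earlier:

// Hypothetical helper: rebuild text from tokens via the renamed
// common_token_to_piece, mirroring what the output_ss loop later in
// main() does piecewise.
static std::string tokens_to_string(llama_context * ctx, const std::vector<llama_token> & toks) {
    std::string out;
    for (const llama_token t : toks) {
        out += common_token_to_piece(ctx, t);
    }
    return out;
}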
@@ -549,7 +549,7 @@ int main(int argc, char ** argv) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
}
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), sparams);
+struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
@@ -750,9 +750,9 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str());
}
-const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+const llama_token id = common_sampler_sample(ctx_sampling, ctx, ctx_guidance);
-llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
+common_sampler_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@@ -773,7 +773,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
-llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
+common_sampler_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
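
The two hunks above rename the per-token sampling pair: sample a token, then feed it back so repetition penalties and grammar state stay current (prompt tokens are accepted with apply_grammar=false, generated ones with true). A condensed sketch of that loop under the renamed API; n_predict and the four-argument llama_batch_get_one are assumptions about the surrounding tree, not shown in this diff:

// Condensed generation loop (sketch). Assumes model, ctx, sparams and
// embd_inp are set up as earlier in main(), and the prompt is already
// decoded.
struct llama_sampling_context * smpl = common_sampler_init(llama_get_model_vocab(model), sparams);
int n_past = (int) embd_inp.size();
for (int i = 0; i < n_predict; ++i) {
    llama_token id = common_sampler_sample(smpl, ctx, /* ctx_guidance= */ nullptr);
    common_sampler_accept(smpl, ctx, id, /* apply_grammar= */ true);
    fprintf(stdout, "%s", common_token_to_piece(ctx, id).c_str());
    if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past++, 0)) != 0) {
        break; // decode failed
    }
}
common_sampler_free(smpl);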
@@ -785,7 +785,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
-const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+const std::string token_str = common_token_to_piece(ctx, id, params.special);
// Console/Stream Output
fprintf(stdout, "%s", token_str.c_str());
@@ -877,7 +877,7 @@ int main(int argc, char ** argv) {
// if current token is not EOG, we add it to current assistant message
if (params.conversation && !waiting_for_first_input) {
auto id = llama_sampling_last(ctx_sampling);
-assistant_ss << llama_token_to_piece(ctx, id, false);
+assistant_ss << common_token_to_piece(ctx, id, false);
}
if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
@@ -955,7 +955,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
-output_ss << llama_token_to_piece(ctx, token);
+output_ss << common_token_to_piece(ctx, token);
}
// reset assistant message
@@ -973,7 +973,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 || waiting_for_first_input) {
if (is_interacting) {
-llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
+common_sampler_reset(llama_get_model_vocab(model), ctx_sampling);
}
is_interacting = false;
waiting_for_first_input = false;
@@ -1006,7 +1006,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
-llama_sampling_free(ctx_sampling);
+common_sampler_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
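
The interactive-reset and cleanup hunks pin down the sampler lifecycle after the rename: common_sampler_reset takes the vocab first and runs between interactive turns, and teardown frees the sampler after the context and model but before the backend. A compressed view of that ordering, using the same names as main():

// Sampler lifecycle after the rename (sketch mirroring the hunks above).
struct llama_sampling_context * ctx_sampling = common_sampler_init(llama_get_model_vocab(model), sparams);
// ... generation and interactive turns ...
common_sampler_reset(llama_get_model_vocab(model), ctx_sampling); // between turns
// ... final generation ...
llama_free(ctx);
llama_free_model(model);
common_sampler_free(ctx_sampling);
llama_backend_free();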