Server: refactor and rename functions (#1151)

* Server: rename functions and refactor code
  - rename functions
  - refactor update slots
  - rename params_base
  - rename timings
* change
* Revert kv cache name changes
* Revert 2
* fix test build error

---------

Co-authored-by: firecoperana <firecoperana>
2026-02-01 12:09:54 +00:00 · 2026-01-18 00:16:57 -06:00
parent 7024fdbc72
commit d71a3ec315
38 changed files with 532 additions and 528 deletions
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -51,7 +51,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
    ~client() {
        if (ctx_sampling) {
-            llama_sampling_free(ctx_sampling);
+            common_sampler_free(ctx_sampling);
        }
    }

@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.ctx_sampling = llama_sampling_init(llama_get_model_vocab(model), params.sparams);
+        client.ctx_sampling = common_sampler_init(llama_get_model_vocab(model), params.sparams);
    }

    std::vector<llama_token> tokens_system;
@@ -190,7 +190,7 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);

        for (int32_t i = 0; i < n_tokens_system; ++i) {
-            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+            common_batch_add(batch, tokens_system[i], i, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
@@ -214,7 +214,7 @@ int main(int argc, char ** argv) {
            llama_kv_cache_dump_view_seqs(kvc_view, 40);
        }

-        llama_batch_clear(batch);
+        common_batch_clear(batch);

        // decode any currently ongoing sequences
        for (auto & client : clients) {
@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {

            client.i_batch = batch.n_tokens;

-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);

            client.n_decoded += 1;
        }
@@ -253,14 +253,14 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

-                    llama_sampling_reset(llama_get_model_vocab(model), client.ctx_sampling);
+                    common_sampler_reset(llama_get_model_vocab(model), client.ctx_sampling);

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                    }

                    // extract the logits only for the last token
@@ -341,9 +341,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
+                const llama_token id = common_sampler_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);

-                llama_sampling_accept(client.ctx_sampling, ctx, id, true);
+                common_sampler_accept(client.ctx_sampling, ctx, id, true);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -351,7 +351,7 @@ int main(int argc, char ** argv) {
                    client.t_start_gen = ggml_time_us();
                }

-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);

                client.response += token_str;
                client.sampled = id;