Tool calls support from mainline (#723)
* Tool calls support from mainline
* update cmake
* revert api for /completions
* Fix broken thinking process for gpt-oss
* add missing args and fix webui bugs
* add missing args and fix webui bugs2
* Fix reasoning format error
* add usage
* change default post_sampling_probs to true
* add back generated_text
* Remove server endpoints tests
* add log
* Chat fixes
* Remove logs
* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -272,6 +272,7 @@ extern "C" {
        LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
    };

    typedef struct llama_token_data {
        llama_token id;    // token id
        float       logit; // log-odds of the token
@@ -1113,6 +1114,23 @@ extern "C" {
    // Get list of built-in chat templates
    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);

    typedef void* llama_sampler_context_t;

    // user code can implement the interface below in order to create custom llama_sampler
    struct llama_sampler_i {
        const char*           (*name)  (const struct llama_sampler* smpl);                          // can be NULL
        void                  (*accept)(struct llama_sampler* smpl, llama_token token);             // can be NULL
        void                  (*apply) (struct llama_sampler* smpl, llama_token_data_array* cur_p); // required
        void                  (*reset) (struct llama_sampler* smpl);                                // can be NULL
        struct llama_sampler* (*clone) (const struct llama_sampler* smpl);                          // can be NULL if ctx is NULL
        void                  (*free)  (struct llama_sampler* smpl);                                // can be NULL if ctx is NULL
    };

    struct llama_sampler {
        struct llama_sampler_i* iface;
        llama_sampler_context_t ctx;
    };

    //
    // Grammar
    //
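The llama_sampler_i vtable above is the extension point for user-defined samplers: a sampler is just an iface pointer plus an opaque ctx. A minimal sketch (not part of this commit) of a stateless sampler that bans one token follows; ban_token_ctx and ban_sampler_init() are illustrative names, and the assumption that the library frees the llama_sampler struct itself, so the free() callback only releases ctx, follows mainline llama.cpp.

#include <math.h>
#include <stdlib.h>

struct ban_token_ctx {
    llama_token banned; // token id to suppress
};

static const char * ban_name(const struct llama_sampler * smpl) {
    (void) smpl;
    return "ban-token";
}

static void ban_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const struct ban_token_ctx * ctx = (const struct ban_token_ctx *) smpl->ctx;
    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].id == ctx->banned) {
            cur_p->data[i].logit = -INFINITY; // can never be sampled
        }
    }
}

static void ban_free(struct llama_sampler * smpl) {
    free(smpl->ctx); // the llama_sampler struct itself is assumed to be freed by the library
}

static struct llama_sampler * ban_sampler_init(llama_token banned); // defined below

static struct llama_sampler * ban_clone(const struct llama_sampler * smpl) {
    const struct ban_token_ctx * ctx = (const struct ban_token_ctx *) smpl->ctx;
    return ban_sampler_init(ctx->banned);
}

static struct llama_sampler_i ban_iface = {
    /*.name   =*/ ban_name,
    /*.accept =*/ NULL,      // stateless: nothing to update per accepted token
    /*.apply  =*/ ban_apply, // the one required callback
    /*.reset  =*/ NULL,      // stateless: nothing to reset
    /*.clone  =*/ ban_clone,
    /*.free   =*/ ban_free,
};

static struct llama_sampler * ban_sampler_init(llama_token banned) {
    struct ban_token_ctx * ctx = (struct ban_token_ctx *) malloc(sizeof(*ctx));
    ctx->banned = banned;
    struct llama_sampler * smpl = (struct llama_sampler *) malloc(sizeof(*smpl));
    smpl->iface = &ban_iface;
    smpl->ctx   = ctx;
    return smpl;
}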
@@ -1128,6 +1146,8 @@ extern "C" {
            size_t n_rules,
            size_t start_rule_index);

    LLAMA_API void llama_grammar_init_lazy(struct llama_sampler_grammar * grammar);

    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
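A hedged lifecycle sketch for the copy/free pair above (not from this commit): llama_grammar_copy() lets two decoding branches advance the same grammar independently. The grammar string is illustrative, and `vocab` is assumed to come from a loaded model; llama_sampler_init_grammar() is declared further down in this diff.

struct llama_grammar * g      = llama_sampler_init_grammar(vocab, "root ::= [0-9]+", "root");
struct llama_grammar * branch = llama_grammar_copy(g); // independent copy of the parse state
// ... advance g and branch on divergent token streams ...
llama_grammar_free(branch);
llama_grammar_free(g);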
@@ -1244,6 +1264,40 @@ extern "C" {
            llama_token_data_array * candidates_p,
            float top_n_sigma);

    LLAMA_API void llama_sampler_reset(struct llama_sampler* smpl);

    LLAMA_API struct llama_grammar* llama_sampler_init_grammar(
            const struct llama_vocab* vocab,
            const char* grammar_str,
            const char* grammar_root);

    /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in the near future.
    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
    DEPRECATED(LLAMA_API struct llama_grammar* llama_sampler_init_grammar_lazy(
            const struct llama_vocab* vocab,
            const char* grammar_str,
            const char* grammar_root,
            const char** trigger_words,
            size_t num_trigger_words,
            const llama_token* trigger_tokens,
            size_t num_trigger_tokens),
        "use llama_sampler_init_grammar_lazy_patterns instead");

    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Patterns are matched from the start of the generation output, and the grammar sampler is fed content starting from the pattern's first match group.
    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. The grammar sampler is fed content starting from the trigger token, included.
    LLAMA_API struct llama_grammar* llama_sampler_init_grammar_lazy_patterns(
            const struct llama_vocab* vocab,
            const char* grammar_str,
            const char* grammar_root,
            const char** trigger_patterns,
            size_t num_trigger_patterns,
            const llama_token* trigger_tokens,
            size_t num_trigger_tokens);

    /// @details DRY sampler, designed by p-e-w, as described in https://github.com/oobabooga/text-generation-webui/pull/5677, porting the Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
    LLAMA_API struct llama_sampler_dry * llama_sampler_init_dry(
            const struct llama_vocab* model,
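A hedged usage sketch for llama_sampler_init_grammar_lazy_patterns() (not from this commit): constrain decoding to a tool-call grammar, but only once the model actually starts a tool call. The grammar text and trigger pattern are illustrative, and `vocab` is assumed to come from a loaded model.

static const char * k_tool_grammar =
    "root ::= \"<tool_call>\" [^<]* \"</tool_call>\"";
static const char * k_triggers[] = {
    // the grammar is fed output starting from the first match group
    "[\\s\\S]*?(<tool_call>[\\s\\S]*)",
};

struct llama_grammar * gr = llama_sampler_init_grammar_lazy_patterns(
        vocab,
        k_tool_grammar,
        "root",          // start rule
        k_triggers, 1,
        NULL, 0);        // no trigger tokens, patterns only
// ... sample with gr; release it with llama_grammar_free(gr) when done ...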