mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-01 01:24:08 +00:00
Tool calls support from mainline (#723)
* Tool calls support from mainline
* update cmake
* revert api for /completions
* Fix broken thinking process for gpt-oss
* add missing args and fix webui bugs
* add missing args and fix webui bugs2
* Fix reasoning format error
* add usage
* change default post_sampling_probs to true
* add back generated_text
* Remove server endpoints tests
* add log
* Chat fixes
* Remove logs
* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -1,26 +1,90 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_sampling;
|
||||
|
||||
struct llama_grammar_parser {
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
|
||||
llama_grammar_rules rules;
|
||||
|
||||
llama_grammar_stack c_rules() const;
|
||||
|
||||
uint32_t get_symbol_id(const char* src, size_t len);
|
||||
uint32_t generate_symbol_id(const std::string& base_name);
|
||||
|
||||
void add_rule(uint32_t rule_id, const llama_grammar_rule& rule);
|
||||
|
||||
const char* parse_alternates(
|
||||
const char* src,
|
||||
const std::string& rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
const char* parse_sequence(
|
||||
const char* src,
|
||||
const std::string& rule_name,
|
||||
llama_grammar_rule& rule,
|
||||
bool is_nested);
|
||||
|
||||
const char* parse_rule(const char* src);
|
||||
|
||||
bool parse(const char* src);
|
||||
void print(FILE* file);
|
||||
};
|
||||
|
||||
// One trigger pattern: the pattern text together with a std::regex object.
struct llama_grammar_trigger_pattern {
    std::string pattern; // pattern text (read back via pattern.c_str() when a grammar is rebuilt)
    std::regex  regex;   // NOTE(review): presumably compiled from `pattern` — confirm where instances are created
};
|
||||
|
||||
struct llama_grammar {
|
||||
const llama_grammar_rules rules;
|
||||
llama_grammar_stacks stacks;
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab* vocab;
|
||||
|
||||
const llama_grammar_rules rules; // TODO: shared ptr
|
||||
llama_grammar_stacks stacks;
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
llama_partial_utf8 partial_utf8;
|
||||
|
||||
// lazy grammars wait for trigger words or tokens before constraining the sampling.
|
||||
// we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
|
||||
// (useful e.g. for tool_choice=required)
|
||||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||
std::vector<llama_grammar_trigger_pattern> trigger_patterns;
|
||||
// Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
||||
// string, and the grammar will be given the string from the first match group onwards.
|
||||
|
||||
};
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
// note: needed for tests (not great)
|
||||
struct llama_grammar* llama_grammar_init_impl(
|
||||
const llama_grammar_element** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
struct llama_grammar* llama_grammar_init_impl(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root,
|
||||
bool lazy,
|
||||
const char** trigger_patterns,
|
||||
size_t num_trigger_patterns,
|
||||
const llama_token* trigger_tokens,
|
||||
size_t num_trigger_tokens);
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar);
|
||||
|
||||
@@ -37,3 +101,8 @@ void llama_grammar_accept_token_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const struct llama_sampling * smpl,
|
||||
llama_token token);
|
||||
|
||||
|
||||
// Declaration only: takes a grammar and a string `piece`.
// NOTE(review): presumably advances the grammar state over `piece` — confirm in the definition.
void llama_grammar_accept_str(
        struct llama_grammar * grammar,
        const std::string    & piece);
|
||||
|
||||
@@ -37,6 +37,7 @@ void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// Logging shorthands: each forwards its arguments to llama_log_internal()
// with the matching GGML log level. Every macro is defined exactly once;
// a repeated definition with different spacing is a macro redefinition.
#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
|
||||
@@ -1035,3 +1035,175 @@ struct llama_sampler_dry* llama_sampler_init_dry_impl(const struct llama_vocab&
|
||||
}
|
||||
|
||||
|
||||
// grammar
|
||||
|
||||
// Context object for the grammar sampler: the vocab, the grammar source
// text and root rule name, and the grammar object itself.
struct llama_sampler_grammar {
    const struct llama_vocab * vocab;

    // Grammar text and root rule name; the reset handler reads these to
    // rebuild the grammar.
    std::string grammar_str;
    std::string grammar_root;

    // May be null; the sampler callbacks check it before use.
    struct llama_grammar * grammar;
};
|
||||
|
||||
// Returns the fixed name of the grammar sampler; the sampler argument is unused.
static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) {
    return "grammar";
}
|
||||
|
||||
static void llama_sampler_grammar_accept_impl(struct llama_sampler* smpl, llama_token token) {
|
||||
auto* ctx = (llama_sampler_grammar*)smpl->ctx;
|
||||
if (ctx->grammar) {
|
||||
llama_grammar_accept_token_impl(ctx->grammar,ctx->vocab ,nullptr, token);
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_sampler_grammar_apply(struct llama_sampler* smpl, llama_token_data_array* cur_p) {
|
||||
auto* ctx = (llama_sampler_grammar*)smpl->ctx;
|
||||
if (ctx->grammar) {
|
||||
llama_grammar_sample_impl(ctx->grammar, ctx->vocab, nullptr, cur_p);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sampler_reset(struct llama_sampler* smpl) {
|
||||
if (smpl->iface->reset) {
|
||||
smpl->iface->reset(smpl);
|
||||
}
|
||||
}
|
||||
|
||||
// Fwd declare to break reset --> init_impl --> llama_sampler_grammar_i --> reset cycle.
|
||||
static struct llama_grammar* llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root,
|
||||
bool lazy,
|
||||
const char** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token* trigger_tokens,
|
||||
size_t num_trigger_tokens,
|
||||
const char** trigger_patterns,
|
||||
size_t num_trigger_patterns);
|
||||
|
||||
static void llama_sampler_grammar_reset(struct llama_sampler* smpl) {
|
||||
auto* ctx = (llama_sampler_grammar*)smpl->ctx;
|
||||
if (!ctx->grammar) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<const char*> trigger_patterns_c;
|
||||
trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
|
||||
for (auto& trigger_pattern : ctx->grammar->trigger_patterns) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
auto* grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
llama_grammar_free_impl(ctx->grammar);
|
||||
ctx->grammar = grammar_new;
|
||||
}
|
||||
|
||||
//static struct llama_sampler* llama_sampler_grammar_clone(const struct llama_sampler* smpl) {
|
||||
// const auto* ctx = (const llama_sampler_grammar*)smpl->ctx;
|
||||
//
|
||||
// auto* result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
|
||||
//
|
||||
// // copy the state
|
||||
// {
|
||||
// auto* result_ctx = (llama_sampler_grammar*)result->ctx;
|
||||
//
|
||||
// if (ctx->grammar) {
|
||||
// result_ctx->grammar_str = ctx->grammar_str;
|
||||
// result_ctx->grammar_root = ctx->grammar_root;
|
||||
//
|
||||
// result_ctx->grammar = llama_grammar_copy_impl(ctx->grammar);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return result;
|
||||
//}
|
||||
|
||||
static void llama_sampler_grammar_free(struct llama_sampler* smpl) {
|
||||
const auto* ctx = (llama_sampler_grammar*)smpl->ctx;
|
||||
|
||||
if (ctx->grammar) {
|
||||
llama_grammar_free_impl(ctx->grammar);
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
// Callback table for the grammar sampler. No clone callback is provided.
static struct llama_sampler_i llama_sampler_grammar_i = {
    /* .name   = */ llama_sampler_grammar_name,
    /* .accept = */ llama_sampler_grammar_accept_impl,
    /* .apply  = */ llama_sampler_grammar_apply,
    /* .reset  = */ llama_sampler_grammar_reset,
    /* .clone  = */ nullptr,
    /* .free   = */ llama_sampler_grammar_free,
};
|
||||
|
||||
struct llama_grammar* llama_sampler_init_grammar_impl(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root,
|
||||
bool lazy,
|
||||
const char** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token* trigger_tokens,
|
||||
size_t num_trigger_tokens,
|
||||
const char** trigger_patterns,
|
||||
size_t num_trigger_patterns) {
|
||||
auto* ctx = new llama_sampler_grammar;
|
||||
struct llama_grammar* grammar;
|
||||
if (grammar_str != nullptr && grammar_str[0] != '\0') {
|
||||
// TODO: remove trigger_words support.
|
||||
if (trigger_words != nullptr && num_trigger_words > 0) {
|
||||
GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
|
||||
std::string trigger_pattern("[\\s\\S]*?(");
|
||||
for (size_t i = 0; i < num_trigger_words; ++i) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
if (i > 0) {
|
||||
trigger_pattern += "|";
|
||||
}
|
||||
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
|
||||
}
|
||||
trigger_pattern += ")[\\s\\S]*";
|
||||
auto trigger_pattern_c = trigger_pattern.c_str();
|
||||
trigger_patterns = &trigger_pattern_c;
|
||||
num_trigger_patterns = 1;
|
||||
}
|
||||
grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
|
||||
}
|
||||
else {
|
||||
grammar = nullptr;
|
||||
}
|
||||
return grammar;
|
||||
}
|
||||
|
||||
struct llama_grammar* llama_sampler_init_grammar(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root) {
|
||||
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
|
||||
}
|
||||
|
||||
struct llama_grammar* llama_sampler_init_grammar_lazy(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root,
|
||||
const char** trigger_words,
|
||||
size_t num_trigger_words,
|
||||
const llama_token* trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
|
||||
}
|
||||
|
||||
struct llama_grammar* llama_sampler_init_grammar_lazy_patterns(
|
||||
const struct llama_vocab* vocab,
|
||||
const char* grammar_str,
|
||||
const char* grammar_root,
|
||||
const char** trigger_patterns,
|
||||
size_t num_trigger_patterns,
|
||||
const llama_token* trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
|
||||
}
|
||||
|
||||
@@ -2340,7 +2340,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
// @ngxson : quick hack for gpt-oss, always render these tokens
|
||||
for (const auto & t : token_to_id) {
|
||||
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
|
||||
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
|
||||
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
}
|
||||
}
|
||||
@@ -2387,6 +2387,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
if (has_return && has_call && has_end) {
|
||||
special_eog_ids.erase(end_id);
|
||||
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
|
||||
}
|
||||
}
|
||||
@@ -2468,7 +2469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
// set attributes by model/tokenizer/architecture name
|
||||
if (false
|
||||
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
|
||||
|| _contains_any(general_arch, {"nomic-bert-moe"})
|
||||
|| _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
|
||||
) {
|
||||
if (token_to_id.count("<mask>") == 0) {
|
||||
LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
|
||||
|
||||
@@ -176,3 +176,6 @@ private:
|
||||
};
|
||||
|
||||
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
|
||||
bool llama_token_is_eog(const struct llama_vocab* vocab, llama_token token);
|
||||
llama_token llama_token_bos(const struct llama_vocab* vocab);
|
||||
llama_token llama_token_eos(const struct llama_vocab* vocab);
|
||||
|
||||
@@ -22071,6 +22071,23 @@ struct llama_grammar * llama_grammar_init(
|
||||
// Public entry point: forwards to llama_grammar_free_impl().
void llama_grammar_free(struct llama_grammar * grammar) {
    llama_grammar_free_impl(grammar);
}
|
||||
//
|
||||
//void llama_grammar_init_lazy(struct llama_sampler* smpl) {
|
||||
//
|
||||
// if (!grammar) {
|
||||
// return;
|
||||
// }
|
||||
// std::vector<const char*> trigger_patterns_c;
|
||||
// trigger_patterns_c.reserve(grammar.grammar->trigger_patterns.size());
|
||||
// for (auto& trigger_pattern : grammar.grammar->trigger_patterns) {
|
||||
// trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
// }
|
||||
// //auto* grammar_new = llama_grammar_init_impl(grammar->vocab, "", "root",
|
||||
// // grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
// // grammar->trigger_tokens.data(), grammar->trigger_tokens.size());
|
||||
//
|
||||
//}
|
||||
|
||||
|
||||
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
|
||||
return llama_grammar_copy_impl(grammar);
|
||||
@@ -22198,6 +22215,7 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
struct llama_sampler_dry * llama_sampler_init_dry(const struct llama_vocab* vocab, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
return llama_sampler_init_dry_impl(*vocab, vocab->n_tokens(), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user