Refactor chat and server file (#1062)

* Add alternative log functions

* chat: fix int overflow, prevent size calculation in float/double (#17357)

* chat: fix int overflow, prevent size calculation in float/double

* Update common/chat.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* common : move all common_chat_parse_* to chat-parser.cpp. (#17481)

# Conflicts:
#	common/chat.cpp

* server: split server.cpp code into server/common/task/queue/context

* Fix compiler warning

* Clean up code

* common: use native MultiByteToWideChar

* move server prompt to server task

* Clean code

* delete utils.hpp

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: DAN™ <dranger003@gmail.com>
This commit is contained in:
firecoperana
2025-12-15 01:27:20 -06:00
committed by GitHub
parent 7b03c9dcef
commit 0e91b89cd3
20 changed files with 6849 additions and 5613 deletions

View File

@@ -2726,11 +2726,29 @@ bool fs_validate_filename(const std::string & filename) {
return true;
}
#ifdef _WIN32
// Convert a UTF-8 encoded string to UTF-16 using the native Win32 API
// (replaces the deprecated-in-C++17 std::wstring_convert/codecvt path).
// Returns an empty wstring on empty input or on any conversion failure.
static std::wstring utf8_to_wstring(const std::string& str) {
    if (str.empty()) {
        return std::wstring();
    }
    // MultiByteToWideChar takes the source length as an int; reject inputs
    // whose size would overflow that cast (int is 32-bit on Windows).
    const size_t max_int = 2147483647; // INT_MAX on Windows
    if (str.size() > max_int) {
        return std::wstring();
    }
    // first call: query the required number of wide characters
    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }
    std::wstring wstr(size, 0);
    // second call: perform the conversion; treat failure as an empty result
    // rather than returning a zero-filled buffer
    if (MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size) <= 0) {
        return std::wstring();
    }
    return wstr;
}
#endif
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
std::wstring wpath = utf8_to_wstring(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -3586,175 +3604,6 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
//
// Chat template utils
//
// NOTE(review): the commented-out implementations below appear to have been
// superseded by common/chat.cpp / chat-parser.cpp in this refactor and are
// kept here only for reference — confirm before deleting.
//
//bool llama_chat_verify_template(const struct llama_model* model, const std::string& tmpl, bool use_jinja) {
// if (use_jinja) {
// try {
// auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
// common_chat_inputs inputs;
// inputs.messages = json::array({ {
// {"role", "user"},
// {"content", "test"},
// } });
// common_chat_params_init(chat_template, inputs);
// return true;
// }
// catch (const std::exception& e) {
// fprintf(stdout,"%s: failed to apply template: %s\n", __func__, e.what());
// return false;
// }
// }
// llama_chat_message chat[] = { {"user", "test"} };
// const int res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
// return res >= 0;
//}
//std::string llama_chat_apply_template(const struct llama_model * model,
// const common_chat_template& tmpl,
// const std::vector<common_chat_msg> & msgs,
// bool add_ass,
// bool use_jinja) {
// if (use_jinja) {
// auto messages = json::array();
// for (const auto& msg : msgs) {
// messages.push_back({ {"role", msg.role}, {"content", msg.content} });
// }
// common_chat_inputs inputs;
// inputs.messages = messages;
// inputs.add_generation_prompt = add_ass;
// return common_chat_params_init(tmpl, inputs).prompt;
// }
// int alloc_size = 0;
// std::vector<llama_chat_message> chat;
// for (auto & msg : msgs) {
// chat.push_back({msg.role.c_str(), msg.content.c_str()});
// alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
// }
//
// std::vector<char> buf(alloc_size);
//
// // run the first time to get the total output length
// int32_t res = llama_chat_apply_template(model, tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// // error: chat template is not supported
// if (res < 0) {
// // if the custom "tmpl" is not supported, we throw an error
// // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
// throw std::runtime_error("this custom template is not supported");
// }
//
// // if it turns out that our buffer is too small, we resize it
// if ((size_t)res > buf.size()) {
// buf.resize(res);
// res = llama_chat_apply_template(model, tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// }
//
// std::string formatted_chat(buf.data(), res);
// return formatted_chat;
//}
////
//std::string llama_chat_format_single(const struct llama_model * model,
// const common_chat_template& tmpl,
// const std::vector<common_chat_msg> & past_msg,
// const common_chat_msg & new_msg,
// bool add_ass,
// bool use_jinja) {
// std::ostringstream ss;
// auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false, use_jinja);
// std::vector<common_chat_msg> chat_new(past_msg);
// // if the past_msg ends with a newline, we must preserve it in the formatted version
// if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
// ss << "\n";
// };
// // format chat with new_msg
// chat_new.push_back(new_msg);
// auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass, use_jinja);
// // get the diff part
// ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
// return ss.str();
//}
//std::string llama_chat_format_example(const struct llama_model * model, const common_chat_template& tmpl, bool use_jinja) {
// std::vector<common_chat_msg> msgs = {
// {"system", "You are a helpful assistant", {}},
// {"user", "Hello", {}},
// {"assistant", "Hi there", {}},
// {"user", "How are you?", {}},
// };
// return llama_chat_apply_template(model, tmpl, msgs, true, use_jinja);
//}
//
//#define CHATML_TEMPLATE_SRC \
// "{%- for message in messages -%}\n" \
// " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
// "{%- endfor -%}\n" \
// "{%- if add_generation_prompt -%}\n" \
// " {{- '<|im_start|>assistant\n' -}}\n" \
// "{%- endif -%}"
//
//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override)
//{
// std::string default_template_src;
// std::string template_tool_use_src;
// bool has_explicit_template = !chat_template_override.empty();
// if (chat_template_override.empty()) {
// auto str = llama_model_chat_template(model, /* name */ nullptr);
// if (str) {
// default_template_src = str;
// has_explicit_template = true;
// }
// str = llama_model_chat_template(model, /* name */ "tool_use");
// if (str) {
// template_tool_use_src = str;
// has_explicit_template = true;
// }
// }
// else {
// default_template_src = chat_template_override;
// }
// if (default_template_src.empty() || default_template_src == "chatml") {
// if (!template_tool_use_src.empty()) {
// default_template_src = template_tool_use_src;
// }
// else {
// default_template_src = CHATML_TEMPLATE_SRC;
// }
// }
// auto vocab = llama_model_get_vocab(model);
// const auto get_token = [&](llama_token token, const char* name, const char* jinja_variable_name) {
// if (token == LLAMA_TOKEN_NULL) {
// if (default_template_src.find(jinja_variable_name) != std::string::npos
// || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
// fprintf(stdout, "%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
// }
// return std::string();
// }
// else {
// return llama_token_to_piece(model, token, true);
// }
// };
// auto token_bos = get_token(llama_token_bos_impl(*vocab), "BOS", "bos_token");
// auto token_eos = get_token(llama_token_eos_impl(*vocab), "EOS", "eos_token");
// try {
// return {
// has_explicit_template,
// std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
// template_tool_use_src.empty()
// ? nullptr
// : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
// };
// }
// catch (const std::exception& e) {
// LOG("%s: failed to parse chat template: %s\n", __func__, e.what());
// return {
// has_explicit_template,
// std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
// nullptr,
// };
// }
//}
//
// KV cache utils