Add vision support in llama-server (#901)

* server: add support for vision model
  webui: add support for vision model
* server : remove hack for extra parallel slot #10187
* llama : fix KV shift for qwen2vl #13870
* add no-context-shift parameter

---------

Co-authored-by: firecoperana <firecoperana>
2026-01-26 17:20:01 +00:00 · 2025-11-05 08:43:46 +00:00
parent 92607d44c4
commit 7978f04996
26 changed files with 2456 additions and 729 deletions
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -57,8 +57,6 @@ add_library(${TARGET} STATIC
    chat-parser.cpp
    chat-parser.h
    common.cpp
-    chat.h
-    chat.cpp
    sampling.h
    sampling.cpp
    console.h
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -270,6 +270,14 @@ static std::string parse_device_list(const std::string& value) {
    return value;
 }

+// Remote-download stub: yields an empty result for an empty URL and throws std::runtime_error for any other URL.
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
+    if (url.empty()) {
+        return {};
+    }
+    throw std::runtime_error("error: built without CURL, cannot download file from the internet");
+}
+
 //
 // CLI argument parsing
 //
@@ -1727,6 +1735,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.n_junk = std::stoi(argv[i]);
        return true;
    }
+    if (arg == "--no-context-shift") {
+        // value-less switch: no CHECK_ARG here, since that advances to the next argv entry and would swallow it
+        params.ctx_shift = false;
+        return true;
+    }
    if (arg == "--pos") {
        CHECK_ARG
        params.i_pos = std::stoi(argv[i]);
@@ -2060,7 +2073,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "multi-modality" });
    options.push_back({ "*",           "       --mmproj FILE",          "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
    options.push_back({ "*",           "       --image FILE",           "path to an image file. use with multimodal models. Specify multiple times for batching" });
-
+    options.push_back({ "*",           "       --no-context-shift",           "disable context-shift." });
    options.push_back({ "backend" });
    options.push_back({ "*",           "       --rpc SERVERS",          "comma separated list of RPC servers" });

@@ -3311,6 +3324,29 @@ std::vector<llama_token> llama_tokenize(
    return result;
 }

+std::vector<llama_token> llama_tokenize(
+    const struct llama_vocab* vocab,
+    const std::string& text,
+    bool   add_special,
+    bool   parse_special) {
+    // first attempt: size the buffer to one token per input byte plus room for the special tokens
+    int max_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> tokens(max_tokens);
+    const int n_written = llama_vocab_tokenize(vocab, text.data(), text.length(), tokens.data(), tokens.size(), add_special, parse_special);
+    if (n_written == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
+    if (n_written >= 0) {
+        tokens.resize(n_written);
+        return tokens;
+    }
+    // negative result: resize the buffer to its magnitude and tokenize again; the retry must report that same count
+    tokens.resize(-n_written);
+    const int check = llama_vocab_tokenize(vocab, text.data(), text.length(), tokens.data(), tokens.size(), add_special, parse_special);
+    GGML_ASSERT(check == -n_written);
+    return tokens;
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
@@ -3343,7 +3379,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
    return piece;
 }

-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -3359,6 +3395,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
    return text;
 }

+
 bool llama_should_add_bos_token(const llama_model * model) {
    const int add_bos = llama_add_bos_token(model);

--- a/common/common.h
+++ b/common/common.h
@@ -53,6 +53,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
    struct llama_lora_adapter * adapter;
 };

+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -237,7 +239,7 @@ struct gpt_params {
    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
-
+    bool ctx_shift         = true;  // context shift is on by default; --no-context-shift sets this to false
    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
@@ -371,6 +373,9 @@ struct gpt_params {
    bool sweep_bench_output_jsonl = false;
 };

+
+
+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);

@@ -381,6 +386,15 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_params_get_system_info(const gpt_params & params);

+// Options passed to common_remote_get_content().
+struct common_remote_params {
+    std::vector<std::string> headers; // NOTE(review): presumably extra HTTP request headers - confirm against the implementation
+    long timeout = 0; // CURLOPT_TIMEOUT, in seconds; 0 means no timeout
+    long max_size = 0; // max size of the response; unlimited if 0; max is 2GB
+};
+// get remote file content, returns <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
+
 //
 // String utils
 //
@@ -497,6 +511,12 @@ std::vector<llama_token> llama_tokenize(
                        bool   add_special,
                        bool   parse_special = false);

+std::vector<llama_token> llama_tokenize(
+    const struct llama_vocab* vocab,
+    const std::string& text,
+    bool   add_special,
+    bool   parse_special = false);
+
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
@@ -513,70 +533,16 @@ std::string llama_token_to_piece(
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string llama_detokenize(
-                         llama_context * ctx,
+        const llama_context * ctx,
        const std::vector<llama_token> & tokens,
                                  bool   special = true);

+
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);

-//
-// Chat template utils
-//
-//struct common_tool_call {
-//    std::string name;
-//    std::string arguments;
-//    std::string id;
-//};
-//
-//// same with llama_chat_message, but uses std::string
-//struct common_chat_msg {
-//    std::string role;
-//    std::string content;
-//    std::vector<common_tool_call> tool_calls;
-//    std::string reasoning_content = "";
-//};

-//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
-//
-//namespace minja {
-//    class chat_template;
-//}
-//
-//typedef minja::chat_template common_chat_template;
-//
-//struct common_chat_templates {
-//    bool has_explicit_template; // Model had builtin template or template overridde was specified.
-//    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
-//    std::unique_ptr<common_chat_template> template_tool_use;
-//};
-//
-//
-//// CPP wrapper for llama_chat_apply_template
-//// If the built-in template is not supported, we default to chatml
-//// If the custom "tmpl" is not supported, we throw an error
-//std::string llama_chat_apply_template(
-//    const struct llama_model* model,
-//    const common_chat_template& tmpl,
-//    const std::vector< common_chat_msg>& chat,
-//    bool add_ass,
-//    bool use_jinja);
-//
-//// Format single message, while taking into account the position of that message in chat history
-//std::string  llama_chat_format_single(const struct llama_model* model,
-//    const common_chat_template& tmpl,
-//    const std::vector< common_chat_msg>& past_msg,
-//    const  common_chat_msg& new_msg,
-//    bool add_ass,
-//    bool use_jinja);
-//
-//// Returns an example of formatted chat
-//std::string  llama_chat_format_example(const struct llama_model* model,
-//    const common_chat_template& tmpl, bool use_jinja);
-//
-//common_chat_templates  llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);


 //