Add vision support in llama-server (#901)

* server: add support for vision model
webui: add support for vision model

* server : remove hack for extra parallel slot #10187

* llama : fix KV shift for qwen2vl #13870

* add no-context-shift parameter

---------

Co-authored-by: firecoperana <firecoperana>
Authored by firecoperana, 2025-11-05 08:43:46 +00:00, committed by GitHub
parent 92607d44c4
commit 7978f04996
26 changed files with 2456 additions and 729 deletions
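
For context, the updated /chat/completions handler accepts OpenAI-style multimodal messages. Below is a minimal sketch of the request body a client might send, built with nlohmann::json as the server code does; the model name and prompt text are placeholders, and the message shape mirrors the checks added in oaicompat_chat_params_parse further down.

// Sketch only, not part of this commit. Images travel as data URLs of the form
// "data:image/<fmt>;base64,<payload>"; the parser splits on ',' and validates
// the prefix before replacing the chunk with the multimodal marker.
#include <nlohmann/json.hpp>
#include <string>

using json = nlohmann::json;

static json make_vision_chat_request(const std::string & image_base64) {
    return json{
        {"model", "any"}, // typically ignored by llama-server, which serves a single model
        {"messages", json::array({
            json{
                {"role", "user"},
                {"content", json::array({
                    json{{"type", "text"}, {"text", "What is in this image?"}},
                    json{
                        {"type", "image_url"},
                        {"image_url", json{{"url", "data:image/png;base64," + image_base64}}}
                    }
                })}
            }
        })}
    };
}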


@@ -57,8 +57,6 @@ add_library(${TARGET} STATIC
chat-parser.cpp
chat-parser.h
common.cpp
chat.h
chat.cpp
sampling.h
sampling.cpp
console.h


@@ -270,6 +270,14 @@ static std::string parse_device_list(const std::string& value) {
return value;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
if (!url.empty()) {
throw std::runtime_error("error: built without CURL, cannot download file from the internet");
}
return {};
}
//
// CLI argument parsing
//
@@ -1727,6 +1735,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_junk = std::stoi(argv[i]);
return true;
}
if (arg == "--no-context-shift") {
CHECK_ARG
params.ctx_shift = false;
return true;
}
if (arg == "--pos") {
CHECK_ARG
params.i_pos = std::stoi(argv[i]);
@@ -2060,7 +2073,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
options.push_back({ "*", " --no-context-shift", "disable context-shift." });
options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
@@ -3311,6 +3324,29 @@ std::vector<llama_token> llama_tokenize(
return result;
}
std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
}
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
@@ -3343,7 +3379,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
return piece;
}
std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -3359,6 +3395,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
const int add_bos = llama_add_bos_token(model);


@@ -53,6 +53,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
struct llama_lora_adapter * adapter;
};
using llama_tokens = std::vector<llama_token>;
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -237,7 +239,7 @@ struct gpt_params {
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool ctx_shift = true;
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -371,6 +373,9 @@ struct gpt_params {
bool sweep_bench_output_jsonl = false;
};
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
@@ -381,6 +386,15 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
struct common_remote_params {
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
//
// String utils
//
@@ -497,6 +511,12 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special = false);
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
@@ -513,70 +533,16 @@ std::string llama_token_to_piece(
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string llama_detokenize(
llama_context * ctx,
const llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
//struct common_tool_call {
// std::string name;
// std::string arguments;
// std::string id;
//};
//
//// same with llama_chat_message, but uses std::string
//struct common_chat_msg {
// std::string role;
// std::string content;
// std::vector<common_tool_call> tool_calls;
// std::string reasoning_content = "";
//};
//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
//
//namespace minja {
// class chat_template;
//}
//
//typedef minja::chat_template common_chat_template;
//
//struct common_chat_templates {
// bool has_explicit_template; // Model had builtin template or template override was specified.
// std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
// std::unique_ptr<common_chat_template> template_tool_use;
//};
//
//
//// CPP wrapper for llama_chat_apply_template
//// If the built-in template is not supported, we default to chatml
//// If the custom "tmpl" is not supported, we throw an error
//std::string llama_chat_apply_template(
// const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& chat,
// bool add_ass,
// bool use_jinja);
//
//// Format single message, while taking into account the position of that message in chat history
//std::string llama_chat_format_single(const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& past_msg,
// const common_chat_msg& new_msg,
// bool add_ass,
// bool use_jinja);
//
//// Returns an example of formatted chat
//std::string llama_chat_format_example(const struct llama_model* model,
// const common_chat_template& tmpl, bool use_jinja);
//
//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
//


@@ -3331,7 +3331,7 @@ struct image_manipulation {
dst.buf.resize(3 * target_width * target_height);
float Cc;
float C[5];
float C[5] = {};
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;


@@ -70,6 +70,9 @@ endif()
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)

Binary file not shown.

File diff suppressed because it is too large


@@ -6,6 +6,9 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include "base64.hpp"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "chat.h"
#include <string>
#include <vector>
@@ -51,6 +54,8 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
using raw_buffer = std::vector<uint8_t>;
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
template <typename T>
@@ -469,8 +474,9 @@ struct oaicompat_parser_options {
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
const struct llama_model* model,
const json& body, /* openai api json semantics */
const oaicompat_parser_options& opt)
json& body, /* openai api json semantics */
const oaicompat_parser_options& opt,
std::vector<raw_buffer>& out_files)
{
json llama_params;
@@ -480,20 +486,6 @@ static json oaicompat_chat_params_parse(
auto stream = json_value(body, "stream", false);
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
/* if (tools.is_array() && !tools.empty()) {
if (stream) {
throw std::runtime_error("Cannot use tools with stream");
}
if (!use_jinja) {
throw std::runtime_error("tools param requires --jinja flag");
}
}
if (!use_jinja) {
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
throw std::runtime_error("Unsupported param: tool_choice");
}
}*/
if (!opt.use_jinja) {
if (has_tools) {
throw std::runtime_error("tools param requires --jinja flag");
@@ -531,8 +523,120 @@ static json oaicompat_chat_params_parse(
json_schema = json_value(json_schema, "schema", json::object());
}
}
// get input files
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
json& messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
}
for (auto& msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (role != "assistant" && !msg.contains("content")) {
throw std::runtime_error("All non-assistant messages must contain 'content'");
}
if (role == "assistant") {
if (!msg.contains("content") && !msg.contains("tool_calls")) {
throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
}
if (!msg.contains("content")) {
continue; // avoid errors with no content
}
}
json& content = msg.at("content");
if (content.is_string() || content.is_null()) {
continue;
}
if (!content.is_array()) {
throw std::runtime_error("Expected 'content' to be a string or an array");
}
for (auto& p : content) {
std::string type = json_value(p, "type", std::string());
if (type == "image_url") {
if (!opt.allow_image) {
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json image_url = json_value(p, "image_url", json::object());
std::string url = json_value(image_url, "url", std::string());
if (string_starts_with(url, "http")) {
// download remote image
// TODO @ngxson : maybe make these params configurable
common_remote_params params;
params.headers.push_back("User-Agent: ik_llama.cpp/");
params.max_size = 1024 * 1024 * 10; // 10MB
params.timeout = 10; // seconds
LOG_INFO("downloading image from '%s'\n", url.c_str());
auto res = common_remote_get_content(url, params);
if (200 <= res.first && res.first < 300) {
LOG_INFO("downloaded %ld bytes\n", res.second.size());
raw_buffer data;
data.insert(data.end(), res.second.begin(), res.second.end());
out_files.push_back(data);
}
else {
throw std::runtime_error("Failed to download image");
}
}
else {
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid image_url.url value");
}
else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
}
else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("image_url.url must be base64 encoded");
}
else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
out_files.push_back(decoded_data);
}
}
// replace this chunk with a marker
p["type"] = "text";
p["text"] = mtmd_default_marker();
p.erase("image_url");
}
else if (type == "input_audio") {
if (!opt.allow_audio) {
throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_audio = json_value(p, "input_audio", json::object());
std::string data = json_value(input_audio, "data", std::string());
std::string format = json_value(input_audio, "format", std::string());
// while we also support flac, we don't allow it here so that we match the OAI spec
if (format != "wav" && format != "mp3") {
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
// replace this chunk with a marker
p["type"] = "text";
p["text"] = mtmd_default_marker();
p.erase("input_audio");
}
else if (type != "text") {
throw std::runtime_error("unsupported content[].type");
}
}
}
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = common_chat_tools_parse_oaicompat(tools);
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
@@ -608,8 +712,9 @@ static json oaicompat_chat_params_parse(
llama_params["grammar"] = chat_params.grammar;
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto& trigger : chat_params.grammar_triggers) {
grammar_triggers.push_back(trigger.to_json<json>());
for (const auto & trigger : chat_params.grammar_triggers) {
server_grammar_trigger ct(trigger);
grammar_triggers.push_back(ct.to_json());
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
@@ -649,6 +754,52 @@ static json oaicompat_chat_params_parse(
return llama_params;
}
//
// tokenizer and input processing utils
//
static bool json_is_array_of_numbers(const json& data) {
if (data.is_array()) {
for (const auto& e : data) {
if (!e.is_number_integer()) {
return false;
}
}
return true;
}
return false;
}
// does the array contain BOTH numbers & strings?
static bool json_is_array_of_mixed_numbers_strings(const json& data) {
bool seen_string = false;
bool seen_number = false;
if (data.is_array()) {
for (const auto& e : data) {
seen_string |= e.is_string();
seen_number |= e.is_number_integer();
if (seen_number && seen_string) {
return true;
}
}
}
return false;
}
// does array have any individual integers/tokens?
static bool json_is_array_and_contains_numbers(const json& data) {
if (data.is_array()) {
for (const auto& e : data) {
if (e.is_number_integer()) {
return true;
}
}
return false;
}
return false;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string>& paths, const json& js) {
json result = json::object();
@@ -673,6 +824,50 @@ static json json_get_nested_values(const std::vector<std::string>& paths, const
}
/**
* this handles 2 cases:
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
static std::vector<llama_token> tokenize_mixed(const llama_vocab* vocab, const json& json_prompt, bool add_special, bool parse_special) {
// If `add_bos` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
if (json_prompt.is_array()) {
bool first = true;
for (const auto& p : json_prompt) {
if (p.is_string()) {
auto s = p.template get<std::string>();
std::vector<llama_token> p;
if (first) {
p = llama_tokenize(vocab, s, add_special, parse_special);
first = false;
}
else {
p = llama_tokenize(vocab, s, false, parse_special);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else {
if (first) {
first = false;
}
prompt_tokens.push_back(p.template get<llama_token>());
}
}
}
else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = llama_tokenize(vocab, s, add_special, parse_special);
}
return prompt_tokens;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
return json {
{"tokens", tokens}
@@ -764,3 +959,480 @@ static token_probabilities get_token_probabilities(llama_context * ctx, int idx,
return {sampled_token_p, cur};
}
/**
* server_tokens is a helper to manage the input tokens and image for the server.
* it is made this way to simplify the logic of KV cache management.
*/
struct server_tokens {
bool has_mtmd = false;
private: // disallow accessing these members directly, to avoid them getting out of sync
// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
std::vector<llama_token> tokens;
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// pos 0 1 2 3 4 5 6 7 8 9
// map_pos_to_media will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
~server_tokens() = default;
// Prevent copying
server_tokens(const server_tokens&) = delete;
server_tokens& operator=(const server_tokens&) = delete;
// Allow moving (usually implicitly generated if members are movable)
server_tokens(server_tokens&&) = default;
server_tokens& operator=(server_tokens&&) = default;
// Allow accessing elements using [] operator
llama_token operator[](size_t index) { return tokens[index]; }
const llama_token& operator[](size_t index) const { return tokens[index]; }
server_tokens(mtmd::input_chunks& mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
push_back(mtmd_chunks[i]);
}
}
server_tokens(std::vector<llama_token>& tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
llama_pos pos_next() const {
if (!has_mtmd) {
return tokens.size();
}
llama_pos res = tokens.size();
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ++it) {
const auto& chunk = it->second;
res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
}
return res;
}
// for debugging
std::string str() const {
std::ostringstream oss;
oss << "tokens: ";
for (const auto& t : tokens) {
if (t == LLAMA_TOKEN_NULL) {
oss << "<embd> ";
}
else {
oss << t << " ";
}
}
oss << "\n";
oss << "image pos: ";
for (const auto& it : map_pos_to_media) {
oss << it.first << ", ";
}
return oss.str();
}
const mtmd::input_chunk_ptr& find_chunk(llama_pos pos) const {
auto it = map_pos_to_media.find(pos);
if (it != map_pos_to_media.end()) {
return it->second;
}
else {
throw std::runtime_error("Chunk not found");
}
}
void push_back(llama_token tok) {
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
}
tokens.emplace_back(tok);
}
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk* chunk) {
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
GGML_ASSERT(has_mtmd);
const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
fprintf(stdout, "n_pos: %d\n", n_pos);
llama_pos start_pos = tokens.size();
for (int i = 0; i < n_pos; ++i) {
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_media[start_pos] = std::move(new_chunk);
}
else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
for (size_t i = 0; i < n_tokens; ++i) {
push_back(text_tokens[i]);
}
}
else {
GGML_ABORT("Invalid chunk type");
}
}
// appends server tokens, updates the media map. copies media chunks.
void push_back(server_tokens& tokens) {
size_t start_pos = size();
for (size_t i = 0; i < tokens.size(); i++) {
push_back(tokens[i]);
}
if (tokens.has_mtmd) {
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
// We could also just check, but this will prevent silently dropping MTMD data.
GGML_ASSERT(has_mtmd);
for (auto it = tokens.map_pos_to_media.begin(); it != tokens.map_pos_to_media.end(); ++it) {
auto chunk = tokens.map_pos_to_media[it->first].get();
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_media[start_pos + it->first] = std::move(new_chunk);
}
}
}
// for compatibility with context shift and prompt truncation
void insert(const std::vector<llama_token>& inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
// for compatibility with context shift and prompt truncation
void resize(size_t size) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.resize(size);
}
llama_token * data() {
return tokens.data();
}
llama_tokens::iterator begin() {
return tokens.begin();
}
llama_tokens::iterator end() {
return tokens.end();
}
llama_tokens::const_iterator cbegin() {
return tokens.cbegin();
}
llama_tokens::const_iterator cend() {
return tokens.cend();
}
llama_tokens tokens_data() {
return tokens;
}
// for compatibility with speculative decoding, ctx shift, slot save/load
const std::vector<llama_token>& get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
return tokens;
}
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;
}
size_t size() const {
return tokens.size();
}
bool empty() const {
return tokens.empty();
}
void clear() {
tokens.clear();
}
void keep_first(size_t n) {
GGML_ASSERT(n <= tokens.size());
if (has_mtmd) {
if (n == tokens.size()) {
return; // nothing to do
}
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// n 1 2 3 4 5 6 7 8 9 10
// allowed to resize ^ ^
// disallowed to resize ^ ^ ^
if (n > 0) {
llama_token last_token = tokens[n - 1];
// make sure we never remove tokens in the middle of an image
if (last_token == LLAMA_TOKEN_NULL) {
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
}
}
// remove all image chunks that are not used anymore
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
llama_pos pos = it->first;
if (pos >= (llama_pos)n) {
it = map_pos_to_media.erase(it);
}
else {
++it;
}
}
}
tokens.resize(n);
}
std::string detokenize(const llama_context* ctx, bool special) const {
llama_tokens text_tokens;
text_tokens.reserve(tokens.size());
for (const auto& t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
text_tokens.push_back(t);
}
}
return llama_detokenize(ctx, text_tokens, special);
}
size_t get_common_prefix(const server_tokens& b) const {
size_t max_idx = std::min(tokens.size(), b.tokens.size());
for (size_t i = 0; i < max_idx; ++i) {
auto& ai = tokens[i];
auto& bi = b.tokens[i];
if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
GGML_ASSERT(has_mtmd);
const auto& a_chunk = find_chunk(i);
const auto& b_chunk = b.find_chunk(i);
GGML_ASSERT(a_chunk && b_chunk);
std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
if (ai_id == bi_id && a_pos == b_pos) {
GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
}
else {
return i;
}
}
else if (ai == bi) {
continue;
}
else {
return i;
}
}
return max_idx; // all tokens are equal
}
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context* ctx) const {
const llama_model* model = llama_get_model(ctx);
const llama_vocab* vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (size_t i = 0; i < tokens.size(); ++i) {
auto& t = tokens[i];
if (t == LLAMA_TOKEN_NULL) {
try {
const auto& chunk = find_chunk(i);
size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
i += n_pos - 1; // will be +1 by the for loop
}
catch (const std::exception& e) {
return false;
}
}
else if (t < 0 || t >= n_vocab) {
return false;
}
}
return true;
}
// encode and decode the image chunk
int32_t process_chunk(
llama_context* ctx,
mtmd_context* mctx,
llama_pos n_past,
int32_t seq_id,
llama_pos& n_pos_out) {
auto& chunk = find_chunk(n_past);
const char* name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
? "image" : "audio";
LOG_INFO("processing %s...\n", name);
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
chunk.get(),
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
LOG_INFO("processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
if (result != 0) {
LOG_ERROR("mtmd_helper_eval failed with status %d", result);
n_pos_out = n_past;
return result;
}
n_pos_out = new_n_past;
return 0;
}
};
// Computes FNV-1a hash of the data
static std::string fnv_hash(const uint8_t* data, size_t len) {
const uint64_t fnv_prime = 0x100000001b3ULL;
uint64_t hash = 0xcbf29ce484222325ULL;
for (size_t i = 0; i < len; ++i) {
hash ^= data[i];
hash *= fnv_prime;
}
return std::to_string(hash);
}
static server_tokens process_mtmd_prompt(mtmd_context* mctx, std::string prompt, std::vector<raw_buffer> files) {
mtmd::bitmaps bitmaps;
for (auto& file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
// process prompt
std::vector<server_tokens> inputs;
// multimodal
mtmd_input_text inp_txt = {
prompt.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
auto result = server_tokens(chunks, true);
return result;
}
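To make the intended flow concrete, here is an illustrative sketch (not part of the commit) of how a slot could consume a multimodal prompt with server_tokens and process_mtmd_prompt() above. It assumes a non-mrope model, so token indices and positions coincide, and it leaves out batching/decoding of the plain text tokens and the bookkeeping that would append the accepted tokens back into `cache`.
// Illustrative only; ctx/mctx are assumed to be initialized by the caller and
// `cache` holds the tokens currently in the KV cache for this slot.
static bool eval_multimodal_prompt(
        llama_context * ctx,
        mtmd_context  * mctx,
        std::string     prompt,
        std::vector<raw_buffer> files,
        server_tokens & cache) {
    // tokenize text + media into a single server_tokens sequence
    server_tokens input = process_mtmd_prompt(mctx, std::move(prompt), std::move(files));
    // reuse the prefix that is already in the KV cache
    const size_t n_keep = cache.get_common_prefix(input);
    cache.keep_first(n_keep);
    for (size_t i = n_keep; i < input.size(); ++i) {
        if (input[i] == LLAMA_TOKEN_NULL) {
            // media chunk: encode + decode it through mtmd
            llama_pos n_pos_out = 0;
            if (input.process_chunk(ctx, mctx, (llama_pos) i, /*seq_id*/ 0, n_pos_out) != 0) {
                return false;
            }
            i = (size_t) n_pos_out - 1; // jump past the chunk (valid only without mrope)
        }
        // plain text tokens would be collected into a llama_batch and decoded here
    }
    return true;
}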
/**
* break the input "prompt" object into multiple prompts if needed, then tokenize them
* use tokenize_input_prompts() if the input could be an array.
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
*/
static server_tokens tokenize_input_subprompt(const llama_vocab* vocab, mtmd_context* mctx, const json& json_prompt, bool add_special, bool parse_special) {
constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data";
const bool has_mtmd = mctx != nullptr;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
std::vector<llama_token> tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
return server_tokens(tmp, false);
}
else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
std::vector<llama_token> tmp = json_prompt.get<std::vector<llama_token>>();
return server_tokens(tmp, false);
}
else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
// JSON object with prompt key.
if (json_prompt.contains(JSON_MTMD_DATA_KEY)) {
if (!has_mtmd)
throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
// JSON object with prompt and multimodal key.
std::vector<raw_buffer> files;
for (const auto& entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
files.push_back(base64_decode(entry));
}
return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
}
else {
// Not multimodal, but contains a subobject.
std::vector<llama_token> tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
return server_tokens(tmp, false);
}
}
else {
throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
}
}
/**
* break the input "prompt" object into multiple prompts if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
*/
static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab* vocab, mtmd_context* mctx, const json& json_prompt, bool add_special, bool parse_special) {
std::vector<server_tokens> result;
if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
result.reserve(json_prompt.size());
for (const auto& p : json_prompt) {
result.push_back(tokenize_input_subprompt(vocab, mctx, p, add_special, parse_special));
}
}
else {
result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
}
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result;
}
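For the non-chat endpoints, the multimodal "prompt" object listed above can be sketched as follows. This is a sketch under assumptions: the field names come from JSON_STRING_PROMPT_KEY / JSON_MTMD_DATA_KEY, and placing mtmd_default_marker() in the text is an assumption about where mtmd_tokenize() expects the media to be inserted.
// Sketch only: returns the value to put under the request's "prompt" field,
// carrying one base64-encoded image alongside the text.
#include <nlohmann/json.hpp>
#include <string>
#include "mtmd.h"

using json = nlohmann::json;

static json make_multimodal_prompt(const std::string & question,
                                   const std::string & image_base64) {
    return json{
        {"prompt_string",   std::string(mtmd_default_marker()) + "\n" + question},
        {"multimodal_data", json::array({image_base64})}
    };
}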
// Assuming raw_buffer has .data() and .size() members
inline void printFilesInfo(const std::vector<raw_buffer>& files) {
for (size_t i = 0; i < files.size(); ++i) {
const auto& file = files[i];
std::cout << "File " << i << ": Size = " << file.size() << " bytes\n";
// Print first 16 bytes in hex
std::cout << "First 16 bytes: ";
for (size_t j = 0; j < std::min<size_t>(file.size(), 16); ++j) {
std::cout << std::hex << std::setw(2) << std::setfill('0')
<< static_cast<int>(file.data()[j]) << " ";
}
std::cout << std::dec << "\n\n"; // Reset to decimal
}
}

File diff suppressed because one or more lines are too long


@@ -19,9 +19,11 @@
"dexie-export-import": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
@@ -1036,6 +1038,191 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@napi-rs/canvas": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz",
"integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==",
"license": "MIT",
"optional": true,
"workspaces": [
"e2e/*"
],
"engines": {
"node": ">= 10"
},
"optionalDependencies": {
"@napi-rs/canvas-android-arm64": "0.1.80",
"@napi-rs/canvas-darwin-arm64": "0.1.80",
"@napi-rs/canvas-darwin-x64": "0.1.80",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.80",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.80",
"@napi-rs/canvas-linux-arm64-musl": "0.1.80",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-musl": "0.1.80",
"@napi-rs/canvas-win32-x64-msvc": "0.1.80"
}
},
"node_modules/@napi-rs/canvas-android-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.80.tgz",
"integrity": "sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.80.tgz",
"integrity": "sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-x64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.80.tgz",
"integrity": "sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.80.tgz",
"integrity": "sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.80.tgz",
"integrity": "sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.80.tgz",
"integrity": "sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.80.tgz",
"integrity": "sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.80.tgz",
"integrity": "sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.80.tgz",
"integrity": "sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.80.tgz",
"integrity": "sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -2060,6 +2247,15 @@
"dev": true,
"license": "Python-2.0"
},
"node_modules/attr-accept": {
"version": "2.2.5",
"resolved": "https://registry.npmjs.org/attr-accept/-/attr-accept-2.2.5.tgz",
"integrity": "sha512-0bDNnY/u6pPwHDMoF0FieU354oBi0a8rD9FcsLwzcGWbc8KS8KPIi7y+s13OlVY+gMWc/9xEMUgNE6Qm8ZllYQ==",
"license": "MIT",
"engines": {
"node": ">=4"
}
},
"node_modules/autoprefixer": {
"version": "10.4.20",
"resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz",
@@ -2815,6 +3011,18 @@
"node": ">=16.0.0"
}
},
"node_modules/file-selector": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/file-selector/-/file-selector-2.1.2.tgz",
"integrity": "sha512-QgXo+mXTe8ljeqUFaX3QVHc5osSItJ/Km+xpocx0aSqWGMSCf6qYs/VnzZgS864Pjn5iceMRFigeAV7AfTlaig==",
"license": "MIT",
"dependencies": {
"tslib": "^2.7.0"
},
"engines": {
"node": ">= 12"
}
},
"node_modules/fill-range": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
@@ -4694,6 +4902,15 @@
"node": ">=0.10.0"
}
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
@@ -4814,6 +5031,18 @@
"node": ">=8"
}
},
"node_modules/pdfjs-dist": {
"version": "5.4.149",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.149.tgz",
"integrity": "sha512-Xe8/1FMJEQPUVSti25AlDpwpUm2QAVmNOpFP0SIahaPIOKBKICaefbzogLdwey3XGGoaP4Lb9wqiw2e9Jqp0LA==",
"license": "Apache-2.0",
"engines": {
"node": ">=20.16.0 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.77"
}
},
"node_modules/picocolors": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
@@ -4892,6 +5121,17 @@
"url": "https://github.com/prettier/prettier?sponsor=1"
}
},
"node_modules/prop-types": {
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"license": "MIT",
"dependencies": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.13.1"
}
},
"node_modules/property-information": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz",
@@ -4958,6 +5198,23 @@
"react": "^18.3.1"
}
},
"node_modules/react-dropzone": {
"version": "14.3.8",
"resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz",
"integrity": "sha512-sBgODnq+lcA4P296DY4wacOZz3JFpD99fp+hb//iBO2HHnyeZU3FwWyXJ6salNpqQdsZrgMrotuko/BdJMV8Ug==",
"license": "MIT",
"dependencies": {
"attr-accept": "^2.2.4",
"file-selector": "^2.1.0",
"prop-types": "^15.8.1"
},
"engines": {
"node": ">= 10.13"
},
"peerDependencies": {
"react": ">= 16.8 || 18.0.0"
}
},
"node_modules/react-hot-toast": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.5.2.tgz",
@@ -4975,6 +5232,12 @@
"react-dom": ">=16"
}
},
"node_modules/react-is": {
"version": "16.13.1",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
"license": "MIT"
},
"node_modules/react-markdown": {
"version": "9.0.3",
"resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz",
@@ -5851,7 +6114,6 @@
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"devOptional": true,
"license": "0BSD"
},
"node_modules/turbo-stream": {


@@ -22,9 +22,11 @@
"dexie-export-import": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",


@@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
pasteLongTextToFileLen: 2500,
pdfAsImage: false,
reasoning_format: 'auto',
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxnt',
@@ -46,6 +48,8 @@ export const CONFIG_INFO: Record<string, string> = {
reasoning_format : 'Specify how to parse reasoning content. none: reasoning content in content block. auto: reasoning content in reasoning_content. ',
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
pasteLongTextToFileLen:
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
samplers:
'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->top_sigma->temperature',
temperature:


@@ -0,0 +1,135 @@
import {
DocumentTextIcon,
SpeakerWaveIcon,
XMarkIcon,
} from '@heroicons/react/24/outline';
import { MessageExtra } from '../utils/types';
import { useState } from 'react';
import { classNames } from '../utils/misc';
export default function ChatInputExtraContextItem({
items,
removeItem,
clickToShow,
}: {
items?: MessageExtra[];
removeItem?: (index: number) => void;
clickToShow?: boolean;
}) {
const [show, setShow] = useState(-1);
const showingItem = show >= 0 ? items?.[show] : undefined;
if (!items) return null;
return (
<div
className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1"
role="group"
aria-description="Selected files"
>
{items.map((item, i) => (
<div
className="indicator"
key={i}
onClick={() => clickToShow && setShow(i)}
tabIndex={0}
aria-description={
clickToShow ? `Click to show: ${item.name}` : undefined
}
role={clickToShow ? 'button' : 'menuitem'}
>
{removeItem && (
<div className="indicator-item indicator-top">
<button
aria-label="Remove file"
className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
onClick={() => removeItem(i)}
>
<XMarkIcon className="h-3 w-3" />
</button>
</div>
)}
<div
className={classNames({
'flex flex-row rounded-md shadow-sm items-center m-0 p-0': true,
'cursor-pointer hover:shadow-md': !!clickToShow,
})}
>
{item.type === 'imageFile' ? (
<>
<img
src={item.base64Url}
alt={`Preview image for ${item.name}`}
className="w-14 h-14 object-cover rounded-md"
/>
</>
) : (
<>
<div
className="w-14 h-14 flex items-center justify-center"
aria-description="Document icon"
>
{item.type === 'audioFile' ? (
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
) : (
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
)}
</div>
<div className="text-xs pr-4">
<b>{item.name ?? 'Extra content'}</b>
</div>
</>
)}
</div>
</div>
))}
{showingItem && (
<dialog
className="modal modal-open"
aria-description={`Preview ${showingItem.name}`}
>
<div className="modal-box">
<div className="flex justify-between items-center mb-4">
<b>{showingItem.name ?? 'Extra content'}</b>
<button
className="btn btn-ghost btn-sm"
aria-label="Close preview dialog"
>
<XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
</button>
</div>
{showingItem.type === 'imageFile' ? (
<img
src={showingItem.base64Url}
alt={`Preview image for ${showingItem.name}`}
/>
) : showingItem.type === 'audioFile' ? (
<audio
controls
className="w-full"
aria-description={`Audio file ${showingItem.name}`}
>
<source
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
type={showingItem.mimeType}
aria-description={`Audio file ${showingItem.name}`}
/>
Your browser does not support the audio element.
</audio>
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">
{showingItem.content}
</pre>
</div>
)}
</div>
<div className="modal-backdrop" onClick={() => setShow(-1)}></div>
</dialog>
)}
</div>
);
}


@@ -3,7 +3,8 @@ import { useAppContext } from '../utils/app.context';
import { Message, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline';
import { ChevronLeftIcon, ChevronRightIcon, ArrowPathIcon, PencilSquareIcon } from '@heroicons/react/24/outline';
import ChatInputExtraContextItem from './ChatInputExtraContextItem';
interface SplitMessage {
content: PendingMessage['content'];
@@ -82,7 +83,11 @@ export default function ChatMessage({
if (!viewingChat) return null;
return (
<div className="group" id={id}>
<div className="group"
id={id}
role="group"
aria-description={`Message from ${msg.role}`}
>
<div
className={classNames({
chat: true,
@@ -90,9 +95,13 @@ export default function ChatMessage({
'chat-end': msg.role === 'user',
})}
>
{msg.extra && msg.extra.length > 0 && (
<ChatInputExtraContextItem items={msg.extra} clickToShow />
)}
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble chat-bubble-primary': true,
'chat-bubble-base-300': msg.role !== 'user',
})}
>
@@ -168,35 +177,6 @@ export default function ChatMessage({
</div>
</details>
)}
{msg.extra && msg.extra.length > 0 && (
<details
className={classNames({
'collapse collapse-arrow mb-4 bg-base-200': true,
'bg-opacity-10': msg.role !== 'assistant',
})}
>
<summary className="collapse-title">
Extra content
</summary>
<div className="collapse-content">
{msg.extra.map(
(extra, i) =>
extra.type === 'textFile' ? (
<div key={extra.name}>
<b>{extra.name}</b>
<pre>{extra.content}</pre>
</div>
) : extra.type === 'context' ? (
<div key={i}>
<pre>{extra.content}</pre>
</div>
) : null // TODO: support other extra types
)}
</div>
</details>
)}
<MarkdownDisplay
content={content}
isGenerating={isPending}
@@ -273,7 +253,7 @@ export default function ChatMessage({
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
<PencilSquareIcon className="h-4 w-4" /> Edit
</button>
)}
{/* assistant message */}
@@ -289,7 +269,7 @@ export default function ChatMessage({
}}
disabled={msg.content === null}
>
🔄 Regenerate
<ArrowPathIcon className="h-4 w-4" /> Regenerate
</button>
)}
{!isPending && (
@@ -298,7 +278,7 @@ export default function ChatMessage({
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
<PencilSquareIcon className="h-4 w-4" /> Edit
</button>
)}
</>


@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from 'react';
import { ClipboardEvent, useEffect, useMemo, useState } from 'react';
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
import ChatMessage from './ChatMessage';
import { CanvasType, Message, PendingMessage } from '../utils/types';
@@ -7,7 +7,17 @@ import CanvasPyInterpreter from './CanvasPyInterpreter';
import StorageUtils from '../utils/storage';
import { useVSCodeContext } from '../utils/llama-vscode';
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
import {
ArrowUpIcon,
StopIcon,
PaperClipIcon,
} from '@heroicons/react/24/solid';
import {
ChatExtraContextApi,
useChatExtraContext,
} from './useChatExtraContext.tsx';
import Dropzone from 'react-dropzone';
import ChatInputExtraContextItem from './ChatInputExtraContextItem.tsx';
/**
* A message display is a message node with additional information for rendering.
* For example, siblings of the message node are stored as their last node (aka leaf node).
@@ -104,9 +114,10 @@ export default function ChatScreen() {
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
const extraContext = useChatExtraContext();
useVSCodeContext(textarea, extraContext);
//const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
// TODO: improve this when we have "upload file" feature
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
// keep track of leaf node for rendering
const [currNodeId, setCurrNodeId] = useState<number>(-1);
@@ -147,7 +158,7 @@ export default function ChatScreen() {
currConvId,
lastMsgNodeId,
lastInpMsg,
currExtra,
extraContext.items,
onChunk
))
) {
@@ -155,7 +166,7 @@ export default function ChatScreen() {
textarea.setValue(lastInpMsg);
}
// OK
clearExtraContext();
extraContext.clearItems();
};
const handleEditMessage = async (msg: Message, content: string) => {
@@ -282,42 +293,14 @@ export default function ChatScreen() {
})}
</div>
{/* chat input */}
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{isGenerating(currConvId ?? '') ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId ?? '')}
>
Stop
</button>
) : (
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
Send
</button>
)}
</div>
{/* chat input */}
<ChatInput
textarea={textarea}
extraContext={extraContext}
onSend={sendNewMessage}
onStop={() => stopGenerating(currConvId ?? '')}
isGenerating={isGenerating(currConvId ?? '')}
/>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
@@ -327,3 +310,183 @@ export default function ChatScreen() {
</div>
);
}
// function ServerInfo() {
// const { serverProps } = useAppContext();
// const modalities = [];
// if (serverProps?.modalities?.audio) {
// modalities.push('audio');
// }
// if (serverProps?.modalities?.vision) {
// modalities.push('vision');
// }
// return (
// <div
// className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
// tabIndex={0}
// aria-description="Server information"
// >
// <div className="card-body">
// <b>Server Info</b>
// <p>
// <b>Model</b>: {serverProps?.model_path?.split(/(\\|\/)/).pop()}
// <br />
// {modalities.length > 0 ? (
// <>
// <b>Supported modalities:</b> {modalities.join(', ')}
// </>
// ) : (
// ''
// )}
// </p>
// </div>
// </div>
// );
// }
function ChatInput({
textarea,
extraContext,
onSend,
onStop,
isGenerating,
}: {
textarea: ChatTextareaApi;
extraContext: ChatExtraContextApi;
onSend: () => void;
onStop: () => void;
isGenerating: boolean;
}) {
const { config } = useAppContext();
const [isDrag, setIsDrag] = useState(false);
return (
<div
role="group"
aria-label="Chat input"
className={classNames({
'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
'opacity-50': isDrag, // simple visual feedback to inform the user that the file will be accepted
})}
>
<Dropzone
noClick
onDrop={(files: File[]) => {
setIsDrag(false);
extraContext.onFileAdded(files);
}}
onDragEnter={() => setIsDrag(true)}
onDragLeave={() => setIsDrag(false)}
multiple={true}
>
{({ getRootProps, getInputProps }) => (
<div
className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
// when a file is pasted into the input, we handle it here
// if pasted text is longer than the configured limit, we convert it to a file
onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
const text = e.clipboardData.getData('text/plain');
if (
text.length > 0 &&
config.pasteLongTextToFileLen > 0 &&
text.length > config.pasteLongTextToFileLen
) {
// if the text is too long, we will convert it to a file
extraContext.addItems([
{
type: 'context',
name: 'Pasted Content',
content: text,
},
]);
e.preventDefault();
return;
}
// if a file is pasted, we will handle it here
const files = Array.from(e.clipboardData.items)
.filter((item) => item.kind === 'file')
.map((item) => item.getAsFile())
.filter((file) => file !== null);
if (files.length > 0) {
e.preventDefault();
extraContext.onFileAdded(files);
}
}}
{...getRootProps()}
>
{!isGenerating && (
<ChatInputExtraContextItem
items={extraContext.items}
removeItem={extraContext.removeItem}
/>
)}
<div className="flex flex-row w-full">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="text-md outline-none border-none w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message..."
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
onSend();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{/* buttons area */}
<div className="flex flex-row gap-2 ml-2">
<label
htmlFor="file-upload"
className={classNames({
'btn w-8 h-8 p-0 rounded-full': true,
'btn-disabled': isGenerating,
})}
aria-label="Upload file"
tabIndex={0}
role="button"
>
<PaperClipIcon className="h-5 w-5" />
</label>
<input
id="file-upload"
type="file"
disabled={isGenerating}
{...getInputProps()}
hidden
/>
{isGenerating ? (
<button
className="btn btn-neutral w-8 h-8 p-0 rounded-full"
onClick={onStop}
>
<StopIcon className="h-5 w-5" />
</button>
) : (
<button
className="btn btn-primary w-8 h-8 p-0 rounded-full"
onClick={onSend}
aria-label="Send message"
>
<ArrowUpIcon className="h-5 w-5" />
</button>
)}
</div>
</div>
</div>
)}
</Dropzone>
</div>
);
}


@@ -12,6 +12,7 @@ import {
ArrowDownTrayIcon,
PencilIcon,
TrashIcon,
MoonIcon,
} from '@heroicons/react/24/outline';
export default function Header() {
@@ -204,16 +205,7 @@ export default function Header() {
<div className="tooltip tooltip-bottom" data-tip="Themes">
<div className="dropdown dropdown-end dropdown-bottom">
<div tabIndex={0} role="button" className="btn m-1">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-palette2"
viewBox="0 0 16 16"
>
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
</svg>
<MoonIcon className="w-5 h-5" />
</div>
<ul
tabIndex={0}


@@ -11,6 +11,7 @@ import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
import { DocumentDuplicateIcon, PlayIcon } from '@heroicons/react/24/outline';
export default function MarkdownDisplay({
content,
@@ -109,7 +110,8 @@ export const CopyButton = ({
}}
onMouseLeave={() => setCopied(false)}
>
{copied ? 'Copied!' : '📋 Copy'}
<DocumentDuplicateIcon className="h-4 w-4" />
{copied ? 'Copied!' : 'Copy'}
</button>
);
};
@@ -133,7 +135,8 @@ export const RunPyCodeButton = ({
})
}
>
Run
<PlayIcon className="h-4 w-4" />
{"Run"}
</button>
</>
);

View File

@@ -275,6 +275,16 @@ const SETTING_SECTIONS = (
key,
}) as SettingFieldInput
),
{
type: SettingInputType.SHORT_INPUT,
label: 'Paste length to file',
key: 'pasteLongTextToFileLen',
},
{
type: SettingInputType.CHECKBOX,
label: 'Parse PDF as image instead of text',
key: 'pdfAsImage',
},
],
},
{

View File

@@ -0,0 +1,371 @@
import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
import * as pdfjs from 'pdfjs-dist';
import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - audio files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)
// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
items?: MessageExtra[]; // undefined if empty, similar to Message['extra']
addItems: (items: MessageExtra[]) => void;
removeItem: (idx: number) => void;
clearItems: () => void;
onFileAdded: (files: File[]) => void; // used by "upload" button
}
export function useChatExtraContext(): ChatExtraContextApi {
const { serverProps, config } = useAppContext();
const [items, setItems] = useState<MessageExtra[]>([]);
const addItems = (newItems: MessageExtra[]) => {
setItems((prev) => [...prev, ...newItems]);
};
const removeItem = (idx: number) => {
setItems((prev) => prev.filter((_, i) => i !== idx));
};
const clearItems = () => {
setItems([]);
};
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = async (files: File[]) => {
try {
for (const file of files) {
const mimeType = file.type;
// this limit is only to prevent accidental uploads of huge files
// it can potentially crash the browser because we read the file as base64
if (file.size > 500 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 500MB.');
break;
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
let base64Url = await getFileAsBase64(file);
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
addItems([
{
type: 'imageFile',
name: file.name,
base64Url,
},
]);
} else if (mimeType.startsWith('video/')) {
toast.error('Video files are not supported yet.');
break;
} else if (mimeType.startsWith('audio/')) {
if (!/mpeg|wav/.test(mimeType)) {
toast.error('Only mp3 and wav audio files are supported.');
break;
}
// plain base64, not a data URL
const base64Data = await getFileAsBase64(file, false);
addItems([
{
type: 'audioFile',
name: file.name,
mimeType,
base64Data,
},
]);
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
// vision is unavailable, so fall through to the text conversion below
toast(
'Multimodal is not supported; the PDF will be converted to text instead of an image.'
);
}
if (config.pdfAsImage && isSupportVision) {
// Convert PDF to images
const base64Urls = await convertPDFToImage(file);
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
} else {
// Convert PDF to text
const content = await convertPDFToText(file);
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to an image instead; see Settings.'
);
}
}
break;
} else {
// Because there can be many text file types (like code files), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
}
};
reader.readAsText(file);
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error processing file: ${message}`;
toast.error(errorMessage);
}
};
return {
items: items.length > 0 ? items : undefined,
addItems,
removeItem,
clearItems,
onFileAdded,
};
}
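// Reads a File as base64. By default it resolves to a full data URL; when
// outputUrl is false it resolves to just the raw base64 payload (used for audio).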
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
let result = event.target.result as string;
if (!outputUrl) {
// strip the data URL prefix (everything up to and including the first comma), leaving raw base64
result = result.substring(result.indexOf(',') + 1);
}
resolve(result);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsDataURL(file);
});
}
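// Reads a File into an ArrayBuffer so it can be handed to pdf.js.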
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
resolve(event.target.result as ArrayBuffer);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsArrayBuffer(file);
});
}
async function convertPDFToText(file: File): Promise<string> {
const buffer = await getFileAsBuffer(file);
const pdf = await pdfjs.getDocument(buffer).promise;
const numPages = pdf.numPages;
const textContentPromises: Promise<TextContent>[] = [];
for (let i = 1; i <= numPages; i++) {
textContentPromises.push(
pdf.getPage(i).then((page) => page.getTextContent())
);
}
const textContents = await Promise.all(textContentPromises);
const textItems = textContents.flatMap((textContent: TextContent) =>
textContent.items.map((item) => (item as TextItem).str ?? '')
);
return textItems.join('\n');
}
// returns a list of base64 data URLs, one per PDF page
async function convertPDFToImage(file: File): Promise<string[]> {
const buffer = await getFileAsBuffer(file);
const doc = await pdfjs.getDocument(buffer).promise;
const pages: Promise<string>[] = [];
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i);
const viewport = page.getViewport({ scale: 1.5 });
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
canvas.width = viewport.width;
canvas.height = viewport.height;
if (!ctx) {
throw new Error('Failed to get 2D context from canvas');
}
const task = page.render({ canvasContext: ctx, viewport: viewport });
pages.push(
task.promise.then(() => {
return canvas.toDataURL();
})
);
}
return await Promise.all(pages);
}
// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because an input file can have one of many mime types which we don't have time to enumerate.
// For example, a Python file can be text/plain, application/x-python, etc.
function isLikelyNotBinary(str: string): boolean {
const options = {
prefixLength: 1024 * 10, // Check the first 10KB of the string
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
maxAbsoluteNullBytes: 2,
};
if (!str) {
return true; // Empty string is considered "not binary" or trivially text.
}
const sampleLength = Math.min(str.length, options.prefixLength);
if (sampleLength === 0) {
return true; // Effectively an empty string after considering prefixLength.
}
let suspiciousCharCount = 0;
let nullByteCount = 0;
for (let i = 0; i < sampleLength; i++) {
const charCode = str.charCodeAt(i);
// 1. Check for Unicode Replacement Character (U+FFFD)
// This is a strong indicator if the string was created from decoding bytes as UTF-8.
if (charCode === 0xfffd) {
suspiciousCharCount++;
continue;
}
// 2. Check for Null Bytes (U+0000)
if (charCode === 0x0000) {
nullByteCount++;
// We also count nulls towards the general suspicious character count,
// as they are less common in typical text files.
suspiciousCharCount++;
continue;
}
// 3. Check for C0 Control Characters (U+0001 to U+001F)
// Exclude common text control characters: TAB (9), LF (10), CR (13).
// We can also be a bit lenient with BEL (7) and BS (8) which sometimes appear in logs.
if (charCode < 32) {
if (
charCode !== 9 && // TAB
charCode !== 10 && // LF
charCode !== 13 && // CR
charCode !== 7 && // BEL (Bell) - sometimes in logs
charCode !== 8 // BS (Backspace) - less common, but possible
) {
suspiciousCharCount++;
}
}
// Characters from 32 (space) up to 126 (~) are printable ASCII.
// Character 127 (DEL) is a control character.
// Characters >= 128 are extended ASCII / multi-byte Unicode.
// If they resulted in U+FFFD, we caught it. Otherwise, they are valid
// (though perhaps unusual) Unicode characters from JS's perspective.
// The main concern is if those higher characters came from misinterpreting
// a single-byte encoding as UTF-8, which again, U+FFFD would usually flag.
}
// Check absolute null byte count
if (nullByteCount > options.maxAbsoluteNullBytes) {
return false; // Too many null bytes is a strong binary indicator
}
// Check ratio of suspicious characters
const ratio = suspiciousCharCount / sampleLength;
return ratio <= options.suspiciousCharThresholdRatio;
}
// WARN: vibe code below
// Converts an SVG image given as a base64 data URL to a PNG data URL using the browser Canvas API.
function svgBase64UrlToPngDataURL(base64UrlSvg: string): Promise<string> {
const backgroundColor = 'white'; // Default background color for PNG
return new Promise((resolve, reject) => {
try {
const img = new Image();
img.onload = () => {
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
if (!ctx) {
reject(new Error('Failed to get 2D canvas context.'));
return;
}
// Use provided dimensions or SVG's natural dimensions, with fallbacks
// Fallbacks (e.g., 300x300) are for SVGs without explicit width/height
// or when naturalWidth/Height might be 0 before full processing.
const targetWidth = img.naturalWidth || 300;
const targetHeight = img.naturalHeight || 300;
canvas.width = targetWidth;
canvas.height = targetHeight;
if (backgroundColor) {
ctx.fillStyle = backgroundColor;
ctx.fillRect(0, 0, canvas.width, canvas.height);
}
ctx.drawImage(img, 0, 0, targetWidth, targetHeight);
resolve(canvas.toDataURL('image/png'));
};
img.onerror = () => {
reject(
new Error('Failed to load SVG image. Ensure the SVG data is valid.')
);
};
// Load SVG string into an Image element
img.src = base64UrlSvg;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error converting SVG to PNG: ${message}`;
toast.error(errorMessage);
reject(new Error(errorMessage));
}
});
}
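
A minimal usage sketch for the hook above (hypothetical component, not part of the diff; the real wiring lives in the chat input component):

import { useChatExtraContext } from './useChatExtraContext';

// Hypothetical upload button. It must be rendered inside AppContextProvider,
// since the hook reads serverProps and config from the app context. Selected
// files are forwarded to onFileAdded, which turns images/audio into base64
// items, PDFs into text or images (depending on the pdfAsImage setting), and
// everything else into plain-text items.
export function UploadSketch() {
  const extraContext = useChatExtraContext();
  return (
    <input
      type="file"
      multiple
      onChange={(e) => {
        const files = Array.from(e.target.files ?? []);
        if (files.length > 0) extraContext.onFileAdded(files);
      }}
    />
  );
}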

View File

@@ -37,6 +37,7 @@ export interface ChatTextareaApi {
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
refOnSubmit: React.MutableRefObject<(() => void) | null>; // Submit handler
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
}
@@ -46,7 +47,7 @@ export interface ChatTextareaApi {
export function useChatTextarea(initValue: string): ChatTextareaApi {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
const onSubmitRef = useRef<(() => void) | null>(null);
// Effect to set initial value and height on mount or when initValue changes
useEffect(() => {
const textarea = textareaRef.current;
@@ -91,6 +92,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
}
},
ref: textareaRef,
refOnSubmit: onSubmitRef,
onInput: handleInput,
};
}

View File

@@ -3,6 +3,7 @@ import {
APIMessage,
CanvasData,
Conversation,
LlamaCppServerProps,
Message,
PendingMessage,
ViewingChat,
@@ -12,6 +13,7 @@ import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
getServerProps
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation, useNavigate } from 'react-router';
@@ -54,6 +56,10 @@ interface AppContextValue {
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
// props
serverProps: LlamaCppServerProps | null;
}
// this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -82,6 +88,9 @@ export const AppContextProvider = ({
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [serverProps, setServerProps] = useState<LlamaCppServerProps | null>(
null
);
const [viewingChat, setViewingChat] = useState<ViewingChat | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
@@ -93,6 +102,20 @@ export const AppContextProvider = ({
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// get server props
useEffect(() => {
getServerProps(BASE_URL, config.apiKey)
.then((props) => {
console.debug('Server props:', props);
setServerProps(props);
})
.catch((err) => {
console.error(err);
toast.error('Failed to fetch server props');
});
// eslint-disable-next-line
}, []);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
@@ -469,6 +492,7 @@ export const AppContextProvider = ({
saveConfig,
showSettings,
setShowSettings,
serverProps,
}}
>
{children}

View File

@@ -1,6 +1,6 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { useEffect } from 'react';
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,11 +15,10 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);
export const useVSCodeContext = (
textarea: ChatTextareaApi,
extraContext: ChatExtraContextApi
) => {
// Accept setText message from a parent window and set inputMsg and extraContext
useEffect(() => {
const handleMessage = (event: MessageEvent) => {
@@ -27,18 +26,25 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const data: SetTextEvData = event.data;
textarea.setValue(data?.text);
if (data?.context && data.context.length > 0) {
setExtraContext({
type: 'context',
content: data.context,
});
extraContext.clearItems();
extraContext.addItems([
{
type: 'context',
name: 'Extra context',
content: data.context,
},
]);
}
textarea.focus();
setTimeout(() => {
textarea.refOnSubmit.current?.();
}, 10); // give the extraContext state update a moment to apply before submitting
}
};
window.addEventListener('message', handleMessage);
return () => window.removeEventListener('message', handleMessage);
}, [textarea]);
}, [textarea, extraContext]);
// Add a keydown listener that sends the "escapePressed" message to the parent window
useEffect(() => {
@@ -52,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
return () => window.removeEventListener('keydown', handleKeyDown);
}, []);
return {
extraContext,
// call once the user message is sent, to clear the extra context
clearExtraContext: () => setExtraContext(null),
};
return {};
};

View File

@@ -1,6 +1,6 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
import { APIMessage, Message, LlamaCppServerProps, APIMessageContentPart } from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
@@ -57,21 +57,55 @@ export const copyStr = (textToCopy: string) => {
*/
export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
return messages.map((msg) => {
let newContent = '';
if (msg.role !== 'user' || !msg.extra) {
return {
role: msg.role,
content: msg.content,
} as APIMessage;
}
// extra content first, then the user text message at the end
// this allows re-using the same cache prefix for long contexts
const contentArr: APIMessageContentPart[] = [];
for (const extra of msg.extra ?? []) {
if (extra.type === 'context') {
if (extra.content!='') {
newContent += `${extra.content}\n\n`;
}
contentArr.push({
type: 'text',
text: extra.content,
});
} else if (extra.type === 'textFile') {
contentArr.push({
type: 'text',
text: `File: ${extra.name}\nContent:\n\n${extra.content}`,
});
} else if (extra.type === 'imageFile') {
contentArr.push({
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else if (extra.type === 'audioFile') {
contentArr.push({
type: 'input_audio',
input_audio: {
data: extra.base64Data,
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
},
});
} else {
throw new Error('Unknown extra type');
}
}
newContent += msg.content;
// add user message to the end
contentArr.push({
type: 'text',
text: msg.content,
});
return {
role: msg.role,
content: newContent,
content: contentArr,
};
}) as APIMessage[];
}
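
With extras attached, the message sent to the OpenAI-compatible endpoint is a content array rather than a plain string. An illustrative value with made-up data (APIMessage comes from ./types as extended in this diff):

// Example output of normalizeMsgsForAPI for one user message carrying a pasted
// context item and an attached image; the typed text goes last so long extra
// content can share the same cached prompt prefix.
const exampleMessage: APIMessage = {
  role: 'user',
  content: [
    { type: 'text', text: 'def test():\n    return 123' }, // 'context' extra
    { type: 'image_url', image_url: { url: 'data:image/png;base64,iVBORw0...' } }, // 'imageFile' extra
    { type: 'text', text: 'Spot the syntax error' }, // the user message itself
  ],
};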
@@ -137,3 +171,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => {
});
window.history.replaceState({}, '', url.toString());
};
export const getServerProps = async (
baseUrl: string,
apiKey?: string
): Promise<LlamaCppServerProps> => {
try {
const response = await fetch(`${baseUrl}/props`, {
headers: {
'Content-Type': 'application/json',
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
},
});
if (!response.ok) {
throw new Error('Failed to fetch server props');
}
const data = await response.json();
return data as LlamaCppServerProps;
} catch (error) {
console.error('Error fetching server props:', error);
throw error;
}
};

View File

@@ -48,7 +48,11 @@ export interface Message {
children: Message['id'][];
}
type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraAudioFile
| MessageExtraContext;
export interface MessageExtraTextFile {
type: 'textFile';
@@ -56,12 +60,43 @@ export interface MessageExtraTextFile {
content: string;
}
export interface MessageExtraImageFile {
type: 'imageFile';
name: string;
base64Url: string;
}
export interface MessageExtraAudioFile {
type: 'audioFile';
name: string;
base64Data: string;
mimeType: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
content: string;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export type APIMessageContentPart =
| {
type: 'text';
text: string;
}
| {
type: 'image_url';
image_url: { url: string };
}
| {
type: 'input_audio';
input_audio: { data: string; format: 'wav' | 'mp3' };
};
export type APIMessage = {
role: Message['role'];
content: string | APIMessageContentPart[];
};
export interface Conversation {
id: string; // format: `conv-{timestamp}`
@@ -96,4 +131,15 @@ export interface SettingsPreset {
name: string;
createdAt: number; // timestamp from Date.now()
config: Record<string, string | number | boolean>; // partial CONFIG_DEFAULT
}
// an incomplete list of props; only contains the ones we need
export interface LlamaCppServerProps {
model_path: string;
n_ctx: number;
modalities?: {
vision: boolean;
audio: boolean;
};
// TODO: support params
}
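
LlamaCppServerProps mirrors only the fields the webui reads from the server's /props endpoint. A plausible, abridged example of what getServerProps might return for a vision-capable model (values are hypothetical):

// Hypothetical /props payload; extra fields returned by the server are ignored.
const exampleProps: LlamaCppServerProps = {
  model_path: '/models/qwen2-vl-7b-instruct-q4_k_m.gguf',
  n_ctx: 8192,
  modalities: {
    vision: true,
    audio: false,
  },
};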

View File

@@ -7,7 +7,7 @@ import zlib from 'node:zlib';
/* eslint-disable */
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const MAX_BUNDLE_SIZE = 2 * 1024 * 1024; // only increase when absolutely necessary
const GUIDE_FOR_FRONTEND = `
<!--

View File

@@ -99,6 +99,18 @@ ggml_cgraph * llm_build_context::build_k_shift() {
GGML_ASSERT(kv_self.size == n_ctx);
const auto & rope_type_shift = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
const float yarn_attn_factor_shift = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
cb(lctx.inp_K_shift, "K_shift", -1);
ggml_set_input(lctx.inp_K_shift);
@@ -127,15 +139,15 @@ ggml_cgraph * llm_build_context::build_k_shift() {
}
}
tmp = ggml_rope_ext_inplace(ctx0, tmp,
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
lctx.inp_K_shift, rope_factors, n_rot, rope_type_shift, n_ctx_orig, freq_base, freq_scale,
ext_factor, yarn_attn_factor_shift, beta_fast, beta_slow);
cb(tmp, "K_shifted_f32", il);
tmp = ggml_cpy(ctx0, tmp, k);
} else {
// we rotate only the first n_rot dimensions
tmp = ggml_rope_ext_inplace(ctx0, k,
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
lctx.inp_K_shift, rope_factors, n_rot, rope_type_shift, n_ctx_orig, freq_base, freq_scale,
ext_factor, yarn_attn_factor_shift, beta_fast, beta_slow);
}
cb(tmp, "K_shifted", il);
ggml_build_forward_expand(gf, tmp);