Add vision support in llama-server (#901)

* server: add support for vision model
webui: add support for vision model

* server : remove hack for extra parallel slot #10187

* llama : fix KV shift for qwen2vl #13870

* add no-context-shift parameter

---------

Co-authored-by: firecoperana <firecoperana>
Authored by firecoperana, 2025-11-05 08:43:46 +00:00, committed by GitHub
parent 92607d44c4
commit 7978f04996
26 changed files with 2456 additions and 729 deletions
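
For context, the updated /chat/completions handler accepts OpenAI-style multimodal messages. Below is a minimal sketch of the request body a client might send, built with nlohmann::json as the server code does; the model name and prompt text are placeholders, and the message shape mirrors the checks added in oaicompat_chat_params_parse further down.

// Sketch only, not part of this commit. Images travel as data URLs of the form
// "data:image/<fmt>;base64,<payload>"; the parser splits on ',' and validates
// the prefix before replacing the chunk with the multimodal marker.
#include <nlohmann/json.hpp>
#include <string>

using json = nlohmann::json;

static json make_vision_chat_request(const std::string & image_base64) {
    return json{
        {"model", "any"}, // typically ignored by llama-server, which serves a single model
        {"messages", json::array({
            json{
                {"role", "user"},
                {"content", json::array({
                    json{{"type", "text"}, {"text", "What is in this image?"}},
                    json{
                        {"type", "image_url"},
                        {"image_url", json{{"url", "data:image/png;base64," + image_base64}}}
                    }
                })}
            }
        })}
    };
}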


@@ -57,8 +57,6 @@ add_library(${TARGET} STATIC
chat-parser.cpp
chat-parser.h
common.cpp
chat.h
chat.cpp
sampling.h
sampling.cpp
console.h


@@ -270,6 +270,14 @@ static std::string parse_device_list(const std::string& value) {
return value;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
if (!url.empty()) {
throw std::runtime_error("error: built without CURL, cannot download file from the internet");
}
return {};
}
//
// CLI argument parsing
//
@@ -1727,6 +1735,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_junk = std::stoi(argv[i]);
return true;
}
if (arg == "--no-context-shift") {
CHECK_ARG
params.ctx_shift = false;
return true;
}
if (arg == "--pos") {
CHECK_ARG
params.i_pos = std::stoi(argv[i]);
@@ -2060,7 +2073,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
options.push_back({ "*", " --no-context-shift", "disable context-shift." });
options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
@@ -3311,6 +3324,29 @@ std::vector<llama_token> llama_tokenize(
return result;
}
std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
}
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_vocab_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
@@ -3343,7 +3379,7 @@ std::string llama_token_to_piece(const struct llama_model* model, llama_token to
return piece;
}
std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string llama_detokenize(const llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -3359,6 +3395,7 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
const int add_bos = llama_add_bos_token(model);


@@ -53,6 +53,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
struct llama_lora_adapter * adapter;
};
using llama_tokens = std::vector<llama_token>;
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -237,7 +239,7 @@ struct gpt_params {
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool ctx_shift = true;
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -371,6 +373,9 @@ struct gpt_params {
bool sweep_bench_output_jsonl = false;
};
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
@@ -381,6 +386,15 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
struct common_remote_params {
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
//
// String utils
//
@@ -497,6 +511,12 @@ std::vector<llama_token> llama_tokenize(
bool add_special,
bool parse_special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_vocab* vocab,
const std::string& text,
bool add_special,
bool parse_special = false);
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
@@ -513,70 +533,16 @@ std::string llama_token_to_piece(
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string llama_detokenize(
llama_context * ctx,
const llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
//struct common_tool_call {
// std::string name;
// std::string arguments;
// std::string id;
//};
//
//// same with llama_chat_message, but uses std::string
//struct common_chat_msg {
// std::string role;
// std::string content;
// std::vector<common_tool_call> tool_calls;
// std::string reasoning_content = "";
//};
//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
//
//namespace minja {
// class chat_template;
//}
//
//typedef minja::chat_template common_chat_template;
//
//struct common_chat_templates {
// bool has_explicit_template; // Model had builtin template or template override was specified.
// std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
// std::unique_ptr<common_chat_template> template_tool_use;
//};
//
//
//// CPP wrapper for llama_chat_apply_template
//// If the built-in template is not supported, we default to chatml
//// If the custom "tmpl" is not supported, we throw an error
//std::string llama_chat_apply_template(
// const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& chat,
// bool add_ass,
// bool use_jinja);
//
//// Format single message, while taking into account the position of that message in chat history
//std::string llama_chat_format_single(const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& past_msg,
// const common_chat_msg& new_msg,
// bool add_ass,
// bool use_jinja);
//
//// Returns an example of formatted chat
//std::string llama_chat_format_example(const struct llama_model* model,
// const common_chat_template& tmpl, bool use_jinja);
//
//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
//


@@ -3331,7 +3331,7 @@ struct image_manipulation {
dst.buf.resize(3 * target_width * target_height);
float Cc;
float C[5];
float C[5] = {};
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;


@@ -70,6 +70,9 @@ endif()
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)

Binary file not shown.

File diff suppressed because it is too large


@@ -6,6 +6,9 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
#include "base64.hpp"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "chat.h"
#include <string>
#include <vector>
@@ -51,6 +54,8 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
using raw_buffer = std::vector<uint8_t>;
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
template <typename T>
@@ -469,8 +474,9 @@ struct oaicompat_parser_options {
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
const struct llama_model* model,
const json& body, /* openai api json semantics */
const oaicompat_parser_options& opt)
json& body, /* openai api json semantics */
const oaicompat_parser_options& opt,
std::vector<raw_buffer>& out_files)
{
json llama_params;
@@ -480,20 +486,6 @@ static json oaicompat_chat_params_parse(
auto stream = json_value(body, "stream", false);
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
/* if (tools.is_array() && !tools.empty()) {
if (stream) {
throw std::runtime_error("Cannot use tools with stream");
}
if (!use_jinja) {
throw std::runtime_error("tools param requires --jinja flag");
}
}
if (!use_jinja) {
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
throw std::runtime_error("Unsupported param: tool_choice");
}
}*/
if (!opt.use_jinja) {
if (has_tools) {
throw std::runtime_error("tools param requires --jinja flag");
@@ -531,8 +523,120 @@ static json oaicompat_chat_params_parse(
json_schema = json_value(json_schema, "schema", json::object());
}
}
// get input files
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
json& messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
}
for (auto& msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (role != "assistant" && !msg.contains("content")) {
throw std::runtime_error("All non-assistant messages must contain 'content'");
}
if (role == "assistant") {
if (!msg.contains("content") && !msg.contains("tool_calls")) {
throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
}
if (!msg.contains("content")) {
continue; // avoid errors with no content
}
}
json& content = msg.at("content");
if (content.is_string() || content.is_null()) {
continue;
}
if (!content.is_array()) {
throw std::runtime_error("Expected 'content' to be a string or an array");
}
for (auto& p : content) {
std::string type = json_value(p, "type", std::string());
if (type == "image_url") {
if (!opt.allow_image) {
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json image_url = json_value(p, "image_url", json::object());
std::string url = json_value(image_url, "url", std::string());
if (string_starts_with(url, "http")) {
// download remote image
// TODO @ngxson : maybe make these params configurable
common_remote_params params;
params.headers.push_back("User-Agent: ik_llama.cpp/");
params.max_size = 1024 * 1024 * 10; // 10MB
params.timeout = 10; // seconds
LOG_INFO("downloading image from '%s'\n", url.c_str());
auto res = common_remote_get_content(url, params);
if (200 <= res.first && res.first < 300) {
LOG_INFO("downloaded %ld bytes\n", res.second.size());
raw_buffer data;
data.insert(data.end(), res.second.begin(), res.second.end());
out_files.push_back(data);
}
else {
throw std::runtime_error("Failed to download image");
}
}
else {
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid image_url.url value");
}
else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
}
else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("image_url.url must be base64 encoded");
}
else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
out_files.push_back(decoded_data);
}
}
// replace this chunk with a marker
p["type"] = "text";
p["text"] = mtmd_default_marker();
p.erase("image_url");
}
else if (type == "input_audio") {
if (!opt.allow_audio) {
throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_audio = json_value(p, "input_audio", json::object());
std::string data = json_value(input_audio, "data", std::string());
std::string format = json_value(input_audio, "format", std::string());
// while we also support flac, we don't allow it here so that we match the OAI spec
if (format != "wav" && format != "mp3") {
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
// replace this chunk with a marker
p["type"] = "text";
p["text"] = mtmd_default_marker();
p.erase("input_audio");
}
else if (type != "text") {
throw std::runtime_error("unsupported content[].type");
}
}
}
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = common_chat_tools_parse_oaicompat(tools);
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
@@ -608,8 +712,9 @@ static json oaicompat_chat_params_parse(
llama_params["grammar"] = chat_params.grammar;
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto& trigger : chat_params.grammar_triggers) {
grammar_triggers.push_back(trigger.to_json<json>());
for (const auto & trigger : chat_params.grammar_triggers) {
server_grammar_trigger ct(trigger);
grammar_triggers.push_back(ct.to_json());
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
@@ -649,6 +754,52 @@ static json oaicompat_chat_params_parse(
return llama_params;
}
//
// tokenizer and input processing utils
//
static bool json_is_array_of_numbers(const json& data) {
if (data.is_array()) {
for (const auto& e : data) {
if (!e.is_number_integer()) {
return false;
}
}
return true;
}
return false;
}
// does the array contain BOTH numbers & strings?
static bool json_is_array_of_mixed_numbers_strings(const json& data) {
bool seen_string = false;
bool seen_number = false;
if (data.is_array()) {
for (const auto& e : data) {
seen_string |= e.is_string();
seen_number |= e.is_number_integer();
if (seen_number && seen_string) {
return true;
}
}
}
return false;
}
// does array have any individual integers/tokens?
static bool json_is_array_and_contains_numbers(const json& data) {
if (data.is_array()) {
for (const auto& e : data) {
if (e.is_number_integer()) {
return true;
}
}
return false;
}
return false;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string>& paths, const json& js) {
json result = json::object();
@@ -673,6 +824,50 @@ static json json_get_nested_values(const std::vector<std::string>& paths, const
}
/**
* this handles 2 cases:
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
static std::vector<llama_token> tokenize_mixed(const llama_vocab* vocab, const json& json_prompt, bool add_special, bool parse_special) {
// If `add_bos` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
if (json_prompt.is_array()) {
bool first = true;
for (const auto& p : json_prompt) {
if (p.is_string()) {
auto s = p.template get<std::string>();
std::vector<llama_token> p;
if (first) {
p = llama_tokenize(vocab, s, add_special, parse_special);
first = false;
}
else {
p = llama_tokenize(vocab, s, false, parse_special);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else {
if (first) {
first = false;
}
prompt_tokens.push_back(p.template get<llama_token>());
}
}
}
else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = llama_tokenize(vocab, s, add_special, parse_special);
}
return prompt_tokens;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
return json {
{"tokens", tokens}
@@ -764,3 +959,480 @@ static token_probabilities get_token_probabilities(llama_context * ctx, int idx,
return {sampled_token_p, cur};
}
/**
* server_tokens is a helper to manage the input tokens and image for the server.
* it is made this way to simplify the logic of KV cache management.
*/
struct server_tokens {
bool has_mtmd = false;
private: // disallow accessing these members directly, to avoid them getting out of sync
// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
std::vector<llama_token> tokens;
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// pos 0 1 2 3 4 5 6 7 8 9
// map_pos_to_media will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
~server_tokens() = default;
// Prevent copying
server_tokens(const server_tokens&) = delete;
server_tokens& operator=(const server_tokens&) = delete;
// Allow moving (usually implicitly generated if members are movable)
server_tokens(server_tokens&&) = default;
server_tokens& operator=(server_tokens&&) = default;
// Allow accessing elements using [] operator
llama_token operator[](size_t index) { return tokens[index]; }
const llama_token& operator[](size_t index) const { return tokens[index]; }
server_tokens(mtmd::input_chunks& mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
push_back(mtmd_chunks[i]);
}
}
server_tokens(std::vector<llama_token>& tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
llama_pos pos_next() const {
if (!has_mtmd) {
return tokens.size();
}
llama_pos res = tokens.size();
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ++it) {
const auto& chunk = it->second;
res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
}
return res;
}
// for debugging
std::string str() const {
std::ostringstream oss;
oss << "tokens: ";
for (const auto& t : tokens) {
if (t == LLAMA_TOKEN_NULL) {
oss << "<embd> ";
}
else {
oss << t << " ";
}
}
oss << "\n";
oss << "image pos: ";
for (const auto& it : map_pos_to_media) {
oss << it.first << ", ";
}
return oss.str();
}
const mtmd::input_chunk_ptr& find_chunk(llama_pos pos) const {
auto it = map_pos_to_media.find(pos);
if (it != map_pos_to_media.end()) {
return it->second;
}
else {
throw std::runtime_error("Chunk not found");
}
}
void push_back(llama_token tok) {
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
}
tokens.emplace_back(tok);
}
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk* chunk) {
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
GGML_ASSERT(has_mtmd);
const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
fprintf(stdout, "n_pos: %d\n", n_pos);
llama_pos start_pos = tokens.size();
for (int i = 0; i < n_pos; ++i) {
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_media[start_pos] = std::move(new_chunk);
}
else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
for (size_t i = 0; i < n_tokens; ++i) {
push_back(text_tokens[i]);
}
}
else {
GGML_ABORT("Invalid chunk type");
}
}
// appends server tokens, updates the media map. copies media chunks.
void push_back(server_tokens& tokens) {
size_t start_pos = size();
for (size_t i = 0; i < tokens.size(); i++) {
push_back(tokens[i]);
}
if (tokens.has_mtmd) {
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
// We could also just check, but this will prevent silently dropping MTMD data.
GGML_ASSERT(has_mtmd);
for (auto it = tokens.map_pos_to_media.begin(); it != tokens.map_pos_to_media.end(); ++it) {
auto chunk = tokens.map_pos_to_media[it->first].get();
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_media[start_pos + it->first] = std::move(new_chunk);
}
}
}
// for compatibility with context shift and prompt truncation
void insert(const std::vector<llama_token>& inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
// for compatibility with context shift and prompt truncation
void resize(size_t size) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.resize(size);
}
llama_token * data() {
return tokens.data();
}
llama_tokens::iterator begin() {
return tokens.begin();
}
llama_tokens::iterator end() {
return tokens.end();
}
llama_tokens::const_iterator cbegin() {
return tokens.cbegin();
}
llama_tokens::const_iterator cend() {
return tokens.cend();
}
llama_tokens tokens_data() {
return tokens;
}
// for compatibility with speculative decoding, ctx shift, slot save/load
const std::vector<llama_token>& get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
return tokens;
}
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;
}
size_t size() const {
return tokens.size();
}
bool empty() const {
return tokens.empty();
}
void clear() {
tokens.clear();
}
void keep_first(size_t n) {
GGML_ASSERT(n <= tokens.size());
if (has_mtmd) {
if (n == tokens.size()) {
return; // nothing to do
}
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// n 1 2 3 4 5 6 7 8 9 10
// allowed to resize ^ ^
// disallowed to resize ^ ^ ^
if (n > 0) {
llama_token last_token = tokens[n - 1];
// make sure we never remove tokens in the middle of an image
if (last_token == LLAMA_TOKEN_NULL) {
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
}
}
// remove all image chunks that are not used anymore
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
llama_pos pos = it->first;
if (pos >= (llama_pos)n) {
it = map_pos_to_media.erase(it);
}
else {
++it;
}
}
}
tokens.resize(n);
}
std::string detokenize(const llama_context* ctx, bool special) const {
llama_tokens text_tokens;
text_tokens.reserve(tokens.size());
for (const auto& t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
text_tokens.push_back(t);
}
}
return llama_detokenize(ctx, text_tokens, special);
}
size_t get_common_prefix(const server_tokens& b) const {
size_t max_idx = std::min(tokens.size(), b.tokens.size());
for (size_t i = 0; i < max_idx; ++i) {
auto& ai = tokens[i];
auto& bi = b.tokens[i];
if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
GGML_ASSERT(has_mtmd);
const auto& a_chunk = find_chunk(i);
const auto& b_chunk = b.find_chunk(i);
GGML_ASSERT(a_chunk && b_chunk);
std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
if (ai_id == bi_id && a_pos == b_pos) {
GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
}
else {
return i;
}
}
else if (ai == bi) {
continue;
}
else {
return i;
}
}
return max_idx; // all tokens are equal
}
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context* ctx) const {
const llama_model* model = llama_get_model(ctx);
const llama_vocab* vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (size_t i = 0; i < tokens.size(); ++i) {
auto& t = tokens[i];
if (t == LLAMA_TOKEN_NULL) {
try {
const auto& chunk = find_chunk(i);
size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
i += n_pos - 1; // will be +1 by the for loop
}
catch (const std::exception& e) {
return false;
}
}
else if (t < 0 || t >= n_vocab) {
return false;
}
}
return true;
}
// encode and decode the image chunk
int32_t process_chunk(
llama_context* ctx,
mtmd_context* mctx,
llama_pos n_past,
int32_t seq_id,
llama_pos& n_pos_out) {
auto& chunk = find_chunk(n_past);
const char* name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
? "image" : "audio";
LOG_INFO("processing %s...\n", name);
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
chunk.get(),
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
LOG_INFO("processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
if (result != 0) {
LOG_ERROR("mtmd_helper_eval failed with status %d", result);
n_pos_out = n_past;
return result;
}
n_pos_out = new_n_past;
return 0;
}
};
// Computes FNV-1a hash of the data
static std::string fnv_hash(const uint8_t* data, size_t len) {
const uint64_t fnv_prime = 0x100000001b3ULL;
uint64_t hash = 0xcbf29ce484222325ULL;
for (size_t i = 0; i < len; ++i) {
hash ^= data[i];
hash *= fnv_prime;
}
return std::to_string(hash);
}
static server_tokens process_mtmd_prompt(mtmd_context* mctx, std::string prompt, std::vector<raw_buffer> files) {
mtmd::bitmaps bitmaps;
for (auto& file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
// process prompt
std::vector<server_tokens> inputs;
// multimodal
mtmd_input_text inp_txt = {
prompt.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
auto result = server_tokens(chunks, true);
return result;
}
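To make the intended flow concrete, here is an illustrative sketch (not part of the commit) of how a slot could consume a multimodal prompt with server_tokens and process_mtmd_prompt() above. It assumes a non-mrope model, so token indices and positions coincide, and it leaves out batching/decoding of the plain text tokens and the bookkeeping that would append the accepted tokens back into `cache`.
// Illustrative only; ctx/mctx are assumed to be initialized by the caller and
// `cache` holds the tokens currently in the KV cache for this slot.
static bool eval_multimodal_prompt(
        llama_context * ctx,
        mtmd_context  * mctx,
        std::string     prompt,
        std::vector<raw_buffer> files,
        server_tokens & cache) {
    // tokenize text + media into a single server_tokens sequence
    server_tokens input = process_mtmd_prompt(mctx, std::move(prompt), std::move(files));
    // reuse the prefix that is already in the KV cache
    const size_t n_keep = cache.get_common_prefix(input);
    cache.keep_first(n_keep);
    for (size_t i = n_keep; i < input.size(); ++i) {
        if (input[i] == LLAMA_TOKEN_NULL) {
            // media chunk: encode + decode it through mtmd
            llama_pos n_pos_out = 0;
            if (input.process_chunk(ctx, mctx, (llama_pos) i, /*seq_id*/ 0, n_pos_out) != 0) {
                return false;
            }
            i = (size_t) n_pos_out - 1; // jump past the chunk (valid only without mrope)
        }
        // plain text tokens would be collected into a llama_batch and decoded here
    }
    return true;
}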
/**
* break the input "prompt" object into multiple prompts if needed, then tokenize them
* use tokenize_input_prompts() if the input could be an array.
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
*/
static server_tokens tokenize_input_subprompt(const llama_vocab* vocab, mtmd_context* mctx, const json& json_prompt, bool add_special, bool parse_special) {
constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string";
constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data";
const bool has_mtmd = mctx != nullptr;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
std::vector<llama_token> tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special);
return server_tokens(tmp, false);
}
else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
std::vector<llama_token> tmp = json_prompt.get<std::vector<llama_token>>();
return server_tokens(tmp, false);
}
else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) {
// JSON object with prompt key.
if (json_prompt.contains(JSON_MTMD_DATA_KEY)) {
if (!has_mtmd)
throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests.");
// JSON object with prompt and multimodal key.
std::vector<raw_buffer> files;
for (const auto& entry : json_prompt.at(JSON_MTMD_DATA_KEY)) {
files.push_back(base64_decode(entry));
}
return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files);
}
else {
// Not multimodal, but contains a subobject.
std::vector<llama_token> tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special);
return server_tokens(tmp, false);
}
}
else {
throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens.");
}
}
/**
* break the input "prompt" object into multiple prompts if needed, then tokenize them
* this supports these cases:
* - "prompt": "string"
* - "prompt": [12, 34, 56]
* - "prompt": [12, 34, "string", 56, 78]
* - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] }
* and multiple prompts (multi-tasks):
* - "prompt": ["string1", "string2"]
* - "prompt": ["string1", [12, 34, 56]]
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}]
*/
static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab* vocab, mtmd_context* mctx, const json& json_prompt, bool add_special, bool parse_special) {
std::vector<server_tokens> result;
if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) {
result.reserve(json_prompt.size());
for (const auto& p : json_prompt) {
result.push_back(tokenize_input_subprompt(vocab, mctx, p, add_special, parse_special));
}
}
else {
result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special));
}
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result;
}
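For the non-chat endpoints, the multimodal "prompt" object listed above can be sketched as follows. This is a sketch under assumptions: the field names come from JSON_STRING_PROMPT_KEY / JSON_MTMD_DATA_KEY, and placing mtmd_default_marker() in the text is an assumption about where mtmd_tokenize() expects the media to be inserted.
// Sketch only: returns the value to put under the request's "prompt" field,
// carrying one base64-encoded image alongside the text.
#include <nlohmann/json.hpp>
#include <string>
#include "mtmd.h"

using json = nlohmann::json;

static json make_multimodal_prompt(const std::string & question,
                                   const std::string & image_base64) {
    return json{
        {"prompt_string",   std::string(mtmd_default_marker()) + "\n" + question},
        {"multimodal_data", json::array({image_base64})}
    };
}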
// Assuming raw_buffer has .data() and .size() members
inline void printFilesInfo(const std::vector<raw_buffer>& files) {
for (size_t i = 0; i < files.size(); ++i) {
const auto& file = files[i];
std::cout << "File " << i << ": Size = " << file.size() << " bytes\n";
// Print first 16 bytes in hex
std::cout << "First 16 bytes: ";
for (size_t j = 0; j < std::min<size_t>(file.size(), 16); ++j) {
std::cout << std::hex << std::setw(2) << std::setfill('0')
<< static_cast<int>(file.data()[j]) << " ";
}
std::cout << std::dec << "\n\n"; // Reset to decimal
}
}

File diff suppressed because one or more lines are too long


@@ -19,9 +19,11 @@
"dexie-export-import": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
@@ -1036,6 +1038,191 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@napi-rs/canvas": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz",
"integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==",
"license": "MIT",
"optional": true,
"workspaces": [
"e2e/*"
],
"engines": {
"node": ">= 10"
},
"optionalDependencies": {
"@napi-rs/canvas-android-arm64": "0.1.80",
"@napi-rs/canvas-darwin-arm64": "0.1.80",
"@napi-rs/canvas-darwin-x64": "0.1.80",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.80",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.80",
"@napi-rs/canvas-linux-arm64-musl": "0.1.80",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-gnu": "0.1.80",
"@napi-rs/canvas-linux-x64-musl": "0.1.80",
"@napi-rs/canvas-win32-x64-msvc": "0.1.80"
}
},
"node_modules/@napi-rs/canvas-android-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.80.tgz",
"integrity": "sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-arm64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.80.tgz",
"integrity": "sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-x64": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.80.tgz",
"integrity": "sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.80.tgz",
"integrity": "sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.80.tgz",
"integrity": "sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.80.tgz",
"integrity": "sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.80.tgz",
"integrity": "sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.80.tgz",
"integrity": "sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-musl": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.80.tgz",
"integrity": "sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
"version": "0.1.80",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.80.tgz",
"integrity": "sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -2060,6 +2247,15 @@
"dev": true,
"license": "Python-2.0"
},
"node_modules/attr-accept": {
"version": "2.2.5",
"resolved": "https://registry.npmjs.org/attr-accept/-/attr-accept-2.2.5.tgz",
"integrity": "sha512-0bDNnY/u6pPwHDMoF0FieU354oBi0a8rD9FcsLwzcGWbc8KS8KPIi7y+s13OlVY+gMWc/9xEMUgNE6Qm8ZllYQ==",
"license": "MIT",
"engines": {
"node": ">=4"
}
},
"node_modules/autoprefixer": {
"version": "10.4.20",
"resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz",
@@ -2815,6 +3011,18 @@
"node": ">=16.0.0"
}
},
"node_modules/file-selector": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/file-selector/-/file-selector-2.1.2.tgz",
"integrity": "sha512-QgXo+mXTe8ljeqUFaX3QVHc5osSItJ/Km+xpocx0aSqWGMSCf6qYs/VnzZgS864Pjn5iceMRFigeAV7AfTlaig==",
"license": "MIT",
"dependencies": {
"tslib": "^2.7.0"
},
"engines": {
"node": ">= 12"
}
},
"node_modules/fill-range": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
@@ -4694,6 +4902,15 @@
"node": ">=0.10.0"
}
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
@@ -4814,6 +5031,18 @@
"node": ">=8"
}
},
"node_modules/pdfjs-dist": {
"version": "5.4.149",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.149.tgz",
"integrity": "sha512-Xe8/1FMJEQPUVSti25AlDpwpUm2QAVmNOpFP0SIahaPIOKBKICaefbzogLdwey3XGGoaP4Lb9wqiw2e9Jqp0LA==",
"license": "Apache-2.0",
"engines": {
"node": ">=20.16.0 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.77"
}
},
"node_modules/picocolors": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
@@ -4892,6 +5121,17 @@
"url": "https://github.com/prettier/prettier?sponsor=1"
}
},
"node_modules/prop-types": {
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"license": "MIT",
"dependencies": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.13.1"
}
},
"node_modules/property-information": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz",
@@ -4958,6 +5198,23 @@
"react": "^18.3.1"
}
},
"node_modules/react-dropzone": {
"version": "14.3.8",
"resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz",
"integrity": "sha512-sBgODnq+lcA4P296DY4wacOZz3JFpD99fp+hb//iBO2HHnyeZU3FwWyXJ6salNpqQdsZrgMrotuko/BdJMV8Ug==",
"license": "MIT",
"dependencies": {
"attr-accept": "^2.2.4",
"file-selector": "^2.1.0",
"prop-types": "^15.8.1"
},
"engines": {
"node": ">= 10.13"
},
"peerDependencies": {
"react": ">= 16.8 || 18.0.0"
}
},
"node_modules/react-hot-toast": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.5.2.tgz",
@@ -4975,6 +5232,12 @@
"react-dom": ">=16"
}
},
"node_modules/react-is": {
"version": "16.13.1",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
"license": "MIT"
},
"node_modules/react-markdown": {
"version": "9.0.3",
"resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz",
@@ -5851,7 +6114,6 @@
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"devOptional": true,
"license": "0BSD"
},
"node_modules/turbo-stream": {


@@ -22,9 +22,11 @@
"dexie-export-import": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",


@@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
pasteLongTextToFileLen: 2500,
pdfAsImage: false,
reasoning_format: 'auto',
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxnt',
@@ -46,6 +48,8 @@ export const CONFIG_INFO: Record<string, string> = {
reasoning_format : 'Specify how to parse reasoning content. none: reasoning content in content block. auto: reasoning content in reasoning_content. ',
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
pasteLongTextToFileLen:
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
samplers:
'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->top_sigma->temperature',
temperature:


@@ -0,0 +1,135 @@
import {
DocumentTextIcon,
SpeakerWaveIcon,
XMarkIcon,
} from '@heroicons/react/24/outline';
import { MessageExtra } from '../utils/types';
import { useState } from 'react';
import { classNames } from '../utils/misc';
export default function ChatInputExtraContextItem({
items,
removeItem,
clickToShow,
}: {
items?: MessageExtra[];
removeItem?: (index: number) => void;
clickToShow?: boolean;
}) {
const [show, setShow] = useState(-1);
const showingItem = show >= 0 ? items?.[show] : undefined;
if (!items) return null;
return (
<div
className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1"
role="group"
aria-description="Selected files"
>
{items.map((item, i) => (
<div
className="indicator"
key={i}
onClick={() => clickToShow && setShow(i)}
tabIndex={0}
aria-description={
clickToShow ? `Click to show: ${item.name}` : undefined
}
role={clickToShow ? 'button' : 'menuitem'}
>
{removeItem && (
<div className="indicator-item indicator-top">
<button
aria-label="Remove file"
className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
onClick={() => removeItem(i)}
>
<XMarkIcon className="h-3 w-3" />
</button>
</div>
)}
<div
className={classNames({
'flex flex-row rounded-md shadow-sm items-center m-0 p-0': true,
'cursor-pointer hover:shadow-md': !!clickToShow,
})}
>
{item.type === 'imageFile' ? (
<>
<img
src={item.base64Url}
alt={`Preview image for ${item.name}`}
className="w-14 h-14 object-cover rounded-md"
/>
</>
) : (
<>
<div
className="w-14 h-14 flex items-center justify-center"
aria-description="Document icon"
>
{item.type === 'audioFile' ? (
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
) : (
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
)}
</div>
<div className="text-xs pr-4">
<b>{item.name ?? 'Extra content'}</b>
</div>
</>
)}
</div>
</div>
))}
{showingItem && (
<dialog
className="modal modal-open"
aria-description={`Preview ${showingItem.name}`}
>
<div className="modal-box">
<div className="flex justify-between items-center mb-4">
<b>{showingItem.name ?? 'Extra content'}</b>
<button
className="btn btn-ghost btn-sm"
aria-label="Close preview dialog"
>
<XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
</button>
</div>
{showingItem.type === 'imageFile' ? (
<img
src={showingItem.base64Url}
alt={`Preview image for ${showingItem.name}`}
/>
) : showingItem.type === 'audioFile' ? (
<audio
controls
className="w-full"
aria-description={`Audio file ${showingItem.name}`}
>
<source
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
type={showingItem.mimeType}
aria-description={`Audio file ${showingItem.name}`}
/>
Your browser does not support the audio element.
</audio>
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">
{showingItem.content}
</pre>
</div>
)}
</div>
<div className="modal-backdrop" onClick={() => setShow(-1)}></div>
</dialog>
)}
</div>
);
}


@@ -3,7 +3,8 @@ import { useAppContext } from '../utils/app.context';
import { Message, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline';
import { ChevronLeftIcon, ChevronRightIcon, ArrowPathIcon, PencilSquareIcon } from '@heroicons/react/24/outline';
import ChatInputExtraContextItem from './ChatInputExtraContextItem';
interface SplitMessage {
content: PendingMessage['content'];
@@ -82,7 +83,11 @@ export default function ChatMessage({
if (!viewingChat) return null;
return (
<div className="group" id={id}>
<div className="group"
id={id}
role="group"
aria-description={`Message from ${msg.role}`}
>
<div
className={classNames({
chat: true,
@@ -90,9 +95,13 @@ export default function ChatMessage({
'chat-end': msg.role === 'user',
})}
>
{msg.extra && msg.extra.length > 0 && (
<ChatInputExtraContextItem items={msg.extra} clickToShow />
)}
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble chat-bubble-primary': true,
'chat-bubble-base-300': msg.role !== 'user',
})}
>
@@ -168,35 +177,6 @@ export default function ChatMessage({
</div>
</details>
)}
{msg.extra && msg.extra.length > 0 && (
<details
className={classNames({
'collapse collapse-arrow mb-4 bg-base-200': true,
'bg-opacity-10': msg.role !== 'assistant',
})}
>
<summary className="collapse-title">
Extra content
</summary>
<div className="collapse-content">
{msg.extra.map(
(extra, i) =>
extra.type === 'textFile' ? (
<div key={extra.name}>
<b>{extra.name}</b>
<pre>{extra.content}</pre>
</div>
) : extra.type === 'context' ? (
<div key={i}>
<pre>{extra.content}</pre>
</div>
) : null // TODO: support other extra types
)}
</div>
</details>
)}
<MarkdownDisplay
content={content}
isGenerating={isPending}
@@ -273,7 +253,7 @@ export default function ChatMessage({
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
<PencilSquareIcon className="h-4 w-4" /> Edit
</button>
)}
{/* assistant message */}
@@ -289,7 +269,7 @@ export default function ChatMessage({
}}
disabled={msg.content === null}
>
🔄 Regenerate
<ArrowPathIcon className="h-4 w-4" /> Regenerate
</button>
)}
{!isPending && (
@@ -298,7 +278,7 @@ export default function ChatMessage({
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
<PencilSquareIcon className="h-4 w-4" /> Edit
</button>
)}
</>


@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from 'react';
import { ClipboardEvent, useEffect, useMemo, useState } from 'react';
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
import ChatMessage from './ChatMessage';
import { CanvasType, Message, PendingMessage } from '../utils/types';
@@ -7,7 +7,17 @@ import CanvasPyInterpreter from './CanvasPyInterpreter';
import StorageUtils from '../utils/storage';
import { useVSCodeContext } from '../utils/llama-vscode';
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
import {
ArrowUpIcon,
StopIcon,
PaperClipIcon,
} from '@heroicons/react/24/solid';
import {
ChatExtraContextApi,
useChatExtraContext,
} from './useChatExtraContext.tsx';
import Dropzone from 'react-dropzone';
import ChatInputExtraContextItem from './ChatInputExtraContextItem.tsx';
/**
* A message display is a message node with additional information for rendering.
* For example, siblings of the message node are stored as their last node (aka leaf node).
@@ -104,9 +114,10 @@ export default function ChatScreen() {
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
const extraContext = useChatExtraContext();
useVSCodeContext(textarea, extraContext);
//const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
// TODO: improve this when we have "upload file" feature
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
// keep track of leaf node for rendering
const [currNodeId, setCurrNodeId] = useState<number>(-1);
@@ -147,7 +158,7 @@ export default function ChatScreen() {
currConvId,
lastMsgNodeId,
lastInpMsg,
currExtra,
extraContext.items,
onChunk
))
) {
@@ -155,7 +166,7 @@ export default function ChatScreen() {
textarea.setValue(lastInpMsg);
}
// OK
clearExtraContext();
extraContext.clearItems();
};
const handleEditMessage = async (msg: Message, content: string) => {
@@ -282,42 +293,14 @@ export default function ChatScreen() {
})}
</div>
{/* chat input */}
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{isGenerating(currConvId ?? '') ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId ?? '')}
>
Stop
</button>
) : (
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
Send
</button>
)}
</div>
{/* chat input */}
<ChatInput
textarea={textarea}
extraContext={extraContext}
onSend={sendNewMessage}
onStop={() => stopGenerating(currConvId ?? '')}
isGenerating={isGenerating(currConvId ?? '')}
/>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
@@ -327,3 +310,183 @@ export default function ChatScreen() {
</div>
);
}
// function ServerInfo() {
// const { serverProps } = useAppContext();
// const modalities = [];
// if (serverProps?.modalities?.audio) {
// modalities.push('audio');
// }
// if (serverProps?.modalities?.vision) {
// modalities.push('vision');
// }
// return (
// <div
// className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
// tabIndex={0}
// aria-description="Server information"
// >
// <div className="card-body">
// <b>Server Info</b>
// <p>
// <b>Model</b>: {serverProps?.model_path?.split(/(\\|\/)/).pop()}
// <br />
// {modalities.length > 0 ? (
// <>
// <b>Supported modalities:</b> {modalities.join(', ')}
// </>
// ) : (
// ''
// )}
// </p>
// </div>
// </div>
// );
// }
function ChatInput({
textarea,
extraContext,
onSend,
onStop,
isGenerating,
}: {
textarea: ChatTextareaApi;
extraContext: ChatExtraContextApi;
onSend: () => void;
onStop: () => void;
isGenerating: boolean;
}) {
const { config } = useAppContext();
const [isDrag, setIsDrag] = useState(false);
return (
<div
role="group"
aria-label="Chat input"
className={classNames({
'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
'opacity-50': isDrag, // simple visual feedback to inform the user that the file will be accepted
})}
>
<Dropzone
noClick
onDrop={(files: File[]) => {
setIsDrag(false);
extraContext.onFileAdded(files);
}}
onDragEnter={() => setIsDrag(true)}
onDragLeave={() => setIsDrag(false)}
multiple={true}
>
{({ getRootProps, getInputProps }) => (
<div
className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
// when a file is pasted into the input, we handle it here
// if pasted text is longer than the configured limit, we convert it to a file
onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
const text = e.clipboardData.getData('text/plain');
if (
text.length > 0 &&
config.pasteLongTextToFileLen > 0 &&
text.length > config.pasteLongTextToFileLen
) {
// if the text is too long, we will convert it to a file
extraContext.addItems([
{
type: 'context',
name: 'Pasted Content',
content: text,
},
]);
e.preventDefault();
return;
}
// if a file is pasted, we will handle it here
const files = Array.from(e.clipboardData.items)
.filter((item) => item.kind === 'file')
.map((item) => item.getAsFile())
.filter((file) => file !== null);
if (files.length > 0) {
e.preventDefault();
extraContext.onFileAdded(files);
}
}}
{...getRootProps()}
>
{!isGenerating && (
<ChatInputExtraContextItem
items={extraContext.items}
removeItem={extraContext.removeItem}
/>
)}
<div className="flex flex-row w-full">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="text-md outline-none border-none w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message..."
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
onSend();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{/* buttons area */}
<div className="flex flex-row gap-2 ml-2">
<label
htmlFor="file-upload"
className={classNames({
'btn w-8 h-8 p-0 rounded-full': true,
'btn-disabled': isGenerating,
})}
aria-label="Upload file"
tabIndex={0}
role="button"
>
<PaperClipIcon className="h-5 w-5" />
</label>
<input
id="file-upload"
type="file"
disabled={isGenerating}
{...getInputProps()}
hidden
/>
{isGenerating ? (
<button
className="btn btn-neutral w-8 h-8 p-0 rounded-full"
onClick={onStop}
>
<StopIcon className="h-5 w-5" />
</button>
) : (
<button
className="btn btn-primary w-8 h-8 p-0 rounded-full"
onClick={onSend}
aria-label="Send message"
>
<ArrowUpIcon className="h-5 w-5" />
</button>
)}
</div>
</div>
</div>
)}
</Dropzone>
</div>
);
}


@@ -12,6 +12,7 @@ import {
ArrowDownTrayIcon,
PencilIcon,
TrashIcon,
MoonIcon,
} from '@heroicons/react/24/outline';
export default function Header() {
@@ -204,16 +205,7 @@ export default function Header() {
<div className="tooltip tooltip-bottom" data-tip="Themes">
<div className="dropdown dropdown-end dropdown-bottom">
<div tabIndex={0} role="button" className="btn m-1">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-palette2"
viewBox="0 0 16 16"
>
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
</svg>
<MoonIcon className="w-5 h-5" />
</div>
<ul
tabIndex={0}


@@ -11,6 +11,7 @@ import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
import { DocumentDuplicateIcon, PlayIcon } from '@heroicons/react/24/outline';
export default function MarkdownDisplay({
content,
@@ -109,7 +110,8 @@ export const CopyButton = ({
}}
onMouseLeave={() => setCopied(false)}
>
{copied ? 'Copied!' : '📋 Copy'}
<DocumentDuplicateIcon className="h-4 w-4" />
{copied ? 'Copied!' : 'Copy'}
</button>
);
};
@@ -133,7 +135,8 @@ export const RunPyCodeButton = ({
})
}
>
Run
<PlayIcon className="h-4 w-4" />
{"Run"}
</button>
</>
);

View File

@@ -275,6 +275,16 @@ const SETTING_SECTIONS = (
key,
}) as SettingFieldInput
),
{
type: SettingInputType.SHORT_INPUT,
label: 'Paste length to file',
key: 'pasteLongTextToFileLen',
},
{
type: SettingInputType.CHECKBOX,
label: 'Parse PDF as image instead of text',
key: 'pdfAsImage',
},
],
},
{

View File

@@ -0,0 +1,371 @@
import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
import * as pdfjs from 'pdfjs-dist';
import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - audio files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)
// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
items?: MessageExtra[]; // undefined if empty, similar to Message['extra']
addItems: (items: MessageExtra[]) => void;
removeItem: (idx: number) => void;
clearItems: () => void;
onFileAdded: (files: File[]) => void; // used by "upload" button
}
export function useChatExtraContext(): ChatExtraContextApi {
const { serverProps, config } = useAppContext();
const [items, setItems] = useState<MessageExtra[]>([]);
const addItems = (newItems: MessageExtra[]) => {
setItems((prev) => [...prev, ...newItems]);
};
const removeItem = (idx: number) => {
setItems((prev) => prev.filter((_, i) => i !== idx));
};
const clearItems = () => {
setItems([]);
};
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = async (files: File[]) => {
try {
for (const file of files) {
const mimeType = file.type;
// this limit is only to prevent accidental uploads of huge files
// it can potentially crash the browser because we read the file as base64
if (file.size > 500 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 500MB.');
break;
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
let base64Url = await getFileAsBase64(file);
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
addItems([
{
type: 'imageFile',
name: file.name,
base64Url,
},
]);
} else if (mimeType.startsWith('video/')) {
toast.error('Video files are not supported yet.');
break;
} else if (mimeType.startsWith('audio/')) {
if (!/mpeg|wav/.test(mimeType)) {
toast.error('Only mp3 and wav audio files are supported.');
break;
}
// plain base64, not a data URL
const base64Data = await getFileAsBase64(file, false);
addItems([
{
type: 'audioFile',
name: file.name,
mimeType,
base64Data,
},
]);
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
// vision is unavailable, so fall through to the text conversion below
toast(
'Multimodal is not supported; the PDF will be converted to text instead of an image.'
);
}
if (config.pdfAsImage && isSupportVision) {
// Convert PDF to images
const base64Urls = await convertPDFToImage(file);
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
} else {
// Convert PDF to text
const content = await convertPDFToText(file);
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to an image instead; see Settings.'
);
}
}
break;
} else {
// Because there can be many text file types (like code files), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
}
};
reader.readAsText(file);
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error processing file: ${message}`;
toast.error(errorMessage);
}
};
return {
items: items.length > 0 ? items : undefined,
addItems,
removeItem,
clearItems,
onFileAdded,
};
}
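// Reads a File as base64. By default it resolves to a full data URL; when
// outputUrl is false it resolves to just the raw base64 payload (used for audio).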
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
let result = event.target.result as string;
if (!outputUrl) {
// strip the data URL prefix (everything up to and including the first comma), leaving raw base64
result = result.substring(result.indexOf(',') + 1);
}
resolve(result);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsDataURL(file);
});
}
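// Reads a File into an ArrayBuffer so it can be handed to pdf.js.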
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
resolve(event.target.result as ArrayBuffer);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsArrayBuffer(file);
});
}
async function convertPDFToText(file: File): Promise<string> {
const buffer = await getFileAsBuffer(file);
const pdf = await pdfjs.getDocument(buffer).promise;
const numPages = pdf.numPages;
const textContentPromises: Promise<TextContent>[] = [];
for (let i = 1; i <= numPages; i++) {
textContentPromises.push(
pdf.getPage(i).then((page) => page.getTextContent())
);
}
const textContents = await Promise.all(textContentPromises);
const textItems = textContents.flatMap((textContent: TextContent) =>
textContent.items.map((item) => (item as TextItem).str ?? '')
);
return textItems.join('\n');
}
// returns a list of base64 data URLs, one per PDF page
async function convertPDFToImage(file: File): Promise<string[]> {
const buffer = await getFileAsBuffer(file);
const doc = await pdfjs.getDocument(buffer).promise;
const pages: Promise<string>[] = [];
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i);
const viewport = page.getViewport({ scale: 1.5 });
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
canvas.width = viewport.width;
canvas.height = viewport.height;
if (!ctx) {
throw new Error('Failed to get 2D context from canvas');
}
const task = page.render({ canvasContext: ctx, viewport: viewport });
pages.push(
task.promise.then(() => {
return canvas.toDataURL();
})
);
}
return await Promise.all(pages);
}
// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because an input file can have one of many mime types which we don't have time to enumerate.
// For example, a Python file can be text/plain, application/x-python, etc.
function isLikelyNotBinary(str: string): boolean {
const options = {
prefixLength: 1024 * 10, // Check the first 10KB of the string
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
maxAbsoluteNullBytes: 2,
};
if (!str) {
return true; // Empty string is considered "not binary" or trivially text.
}
const sampleLength = Math.min(str.length, options.prefixLength);
if (sampleLength === 0) {
return true; // Effectively an empty string after considering prefixLength.
}
let suspiciousCharCount = 0;
let nullByteCount = 0;
for (let i = 0; i < sampleLength; i++) {
const charCode = str.charCodeAt(i);
// 1. Check for Unicode Replacement Character (U+FFFD)
// This is a strong indicator if the string was created from decoding bytes as UTF-8.
if (charCode === 0xfffd) {
suspiciousCharCount++;
continue;
}
// 2. Check for Null Bytes (U+0000)
if (charCode === 0x0000) {
nullByteCount++;
// We also count nulls towards the general suspicious character count,
// as they are less common in typical text files.
suspiciousCharCount++;
continue;
}
// 3. Check for C0 Control Characters (U+0001 to U+001F)
// Exclude common text control characters: TAB (9), LF (10), CR (13).
// We can also be a bit lenient with BEL (7) and BS (8) which sometimes appear in logs.
if (charCode < 32) {
if (
charCode !== 9 && // TAB
charCode !== 10 && // LF
charCode !== 13 && // CR
charCode !== 7 && // BEL (Bell) - sometimes in logs
charCode !== 8 // BS (Backspace) - less common, but possible
) {
suspiciousCharCount++;
}
}
// Characters from 32 (space) up to 126 (~) are printable ASCII.
// Character 127 (DEL) is a control character.
// Characters >= 128 are extended ASCII / multi-byte Unicode.
// If they resulted in U+FFFD, we caught it. Otherwise, they are valid
// (though perhaps unusual) Unicode characters from JS's perspective.
// The main concern is if those higher characters came from misinterpreting
// a single-byte encoding as UTF-8, which again, U+FFFD would usually flag.
}
// Check absolute null byte count
if (nullByteCount > options.maxAbsoluteNullBytes) {
return false; // Too many null bytes is a strong binary indicator
}
// Check ratio of suspicious characters
const ratio = suspiciousCharCount / sampleLength;
return ratio <= options.suspiciousCharThresholdRatio;
}
// WARN: vibe code below
// Converts an SVG image given as a base64 data URL to a PNG data URL using the browser Canvas API.
function svgBase64UrlToPngDataURL(base64UrlSvg: string): Promise<string> {
const backgroundColor = 'white'; // Default background color for PNG
return new Promise((resolve, reject) => {
try {
const img = new Image();
img.onload = () => {
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
if (!ctx) {
reject(new Error('Failed to get 2D canvas context.'));
return;
}
// Use provided dimensions or SVG's natural dimensions, with fallbacks
// Fallbacks (e.g., 300x300) are for SVGs without explicit width/height
// or when naturalWidth/Height might be 0 before full processing.
const targetWidth = img.naturalWidth || 300;
const targetHeight = img.naturalHeight || 300;
canvas.width = targetWidth;
canvas.height = targetHeight;
if (backgroundColor) {
ctx.fillStyle = backgroundColor;
ctx.fillRect(0, 0, canvas.width, canvas.height);
}
ctx.drawImage(img, 0, 0, targetWidth, targetHeight);
resolve(canvas.toDataURL('image/png'));
};
img.onerror = () => {
reject(
new Error('Failed to load SVG image. Ensure the SVG data is valid.')
);
};
// Load SVG string into an Image element
img.src = base64UrlSvg;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error converting SVG to PNG: ${message}`;
toast.error(errorMessage);
reject(new Error(errorMessage));
}
});
}
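
A minimal usage sketch for the hook above (hypothetical component, not part of the diff; the real wiring lives in the chat input component):

import { useChatExtraContext } from './useChatExtraContext';

// Hypothetical upload button. It must be rendered inside AppContextProvider,
// since the hook reads serverProps and config from the app context. Selected
// files are forwarded to onFileAdded, which turns images/audio into base64
// items, PDFs into text or images (depending on the pdfAsImage setting), and
// everything else into plain-text items.
export function UploadSketch() {
  const extraContext = useChatExtraContext();
  return (
    <input
      type="file"
      multiple
      onChange={(e) => {
        const files = Array.from(e.target.files ?? []);
        if (files.length > 0) extraContext.onFileAdded(files);
      }}
    />
  );
}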

View File

@@ -37,6 +37,7 @@ export interface ChatTextareaApi {
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
refOnSubmit: React.MutableRefObject<(() => void) | null>; // Submit handler
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
}
@@ -46,7 +47,7 @@ export interface ChatTextareaApi {
export function useChatTextarea(initValue: string): ChatTextareaApi {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
const onSubmitRef = useRef<(() => void) | null>(null);
// Effect to set initial value and height on mount or when initValue changes
useEffect(() => {
const textarea = textareaRef.current;
@@ -91,6 +92,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
}
},
ref: textareaRef,
refOnSubmit: onSubmitRef,
onInput: handleInput,
};
}

View File

@@ -3,6 +3,7 @@ import {
APIMessage,
CanvasData,
Conversation,
LlamaCppServerProps,
Message,
PendingMessage,
ViewingChat,
@@ -12,6 +13,7 @@ import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
getServerProps
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation, useNavigate } from 'react-router';
@@ -54,6 +56,10 @@ interface AppContextValue {
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
// props
serverProps: LlamaCppServerProps | null;
}
// this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -82,6 +88,9 @@ export const AppContextProvider = ({
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [serverProps, setServerProps] = useState<LlamaCppServerProps | null>(
null
);
const [viewingChat, setViewingChat] = useState<ViewingChat | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
@@ -93,6 +102,20 @@ export const AppContextProvider = ({
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// get server props
useEffect(() => {
getServerProps(BASE_URL, config.apiKey)
.then((props) => {
console.debug('Server props:', props);
setServerProps(props);
})
.catch((err) => {
console.error(err);
toast.error('Failed to fetch server props');
});
// eslint-disable-next-line
}, []);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
@@ -469,6 +492,7 @@ export const AppContextProvider = ({
saveConfig,
showSettings,
setShowSettings,
serverProps,
}}
>
{children}

View File

@@ -1,6 +1,6 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { useEffect } from 'react';
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,11 +15,10 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);
export const useVSCodeContext = (
textarea: ChatTextareaApi,
extraContext: ChatExtraContextApi
) => {
// Accept setText message from a parent window and set inputMsg and extraContext
useEffect(() => {
const handleMessage = (event: MessageEvent) => {
@@ -27,18 +26,25 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const data: SetTextEvData = event.data;
textarea.setValue(data?.text);
if (data?.context && data.context.length > 0) {
setExtraContext({
type: 'context',
content: data.context,
});
extraContext.clearItems();
extraContext.addItems([
{
type: 'context',
name: 'Extra context',
content: data.context,
},
]);
}
textarea.focus();
setTimeout(() => {
textarea.refOnSubmit.current?.();
}, 10); // give the extraContext state update a moment to apply before submitting
}
};
window.addEventListener('message', handleMessage);
return () => window.removeEventListener('message', handleMessage);
}, [textarea]);
}, [textarea, extraContext]);
// Add a keydown listener that sends the "escapePressed" message to the parent window
useEffect(() => {
@@ -52,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
return () => window.removeEventListener('keydown', handleKeyDown);
}, []);
return {
extraContext,
// call once the user message is sent, to clear the extra context
clearExtraContext: () => setExtraContext(null),
};
return {};
};

View File

@@ -1,6 +1,6 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
import { APIMessage, Message, LlamaCppServerProps, APIMessageContentPart } from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
@@ -57,21 +57,55 @@ export const copyStr = (textToCopy: string) => {
*/
export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
return messages.map((msg) => {
let newContent = '';
if (msg.role !== 'user' || !msg.extra) {
return {
role: msg.role,
content: msg.content,
} as APIMessage;
}
// extra content first, then the user text message at the end
// this allows re-using the same cache prefix for long contexts
const contentArr: APIMessageContentPart[] = [];
for (const extra of msg.extra ?? []) {
if (extra.type === 'context') {
if (extra.content!='') {
newContent += `${extra.content}\n\n`;
}
contentArr.push({
type: 'text',
text: extra.content,
});
} else if (extra.type === 'textFile') {
contentArr.push({
type: 'text',
text: `File: ${extra.name}\nContent:\n\n${extra.content}`,
});
} else if (extra.type === 'imageFile') {
contentArr.push({
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else if (extra.type === 'audioFile') {
contentArr.push({
type: 'input_audio',
input_audio: {
data: extra.base64Data,
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
},
});
} else {
throw new Error('Unknown extra type');
}
}
newContent += msg.content;
// add user message to the end
contentArr.push({
type: 'text',
text: msg.content,
});
return {
role: msg.role,
content: newContent,
content: contentArr,
};
}) as APIMessage[];
}
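
With extras attached, the message sent to the OpenAI-compatible endpoint is a content array rather than a plain string. An illustrative value with made-up data (APIMessage comes from ./types as extended in this diff):

// Example output of normalizeMsgsForAPI for one user message carrying a pasted
// context item and an attached image; the typed text goes last so long extra
// content can share the same cached prompt prefix.
const exampleMessage: APIMessage = {
  role: 'user',
  content: [
    { type: 'text', text: 'def test():\n    return 123' }, // 'context' extra
    { type: 'image_url', image_url: { url: 'data:image/png;base64,iVBORw0...' } }, // 'imageFile' extra
    { type: 'text', text: 'Spot the syntax error' }, // the user message itself
  ],
};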
@@ -137,3 +171,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => {
});
window.history.replaceState({}, '', url.toString());
};
export const getServerProps = async (
baseUrl: string,
apiKey?: string
): Promise<LlamaCppServerProps> => {
try {
const response = await fetch(`${baseUrl}/props`, {
headers: {
'Content-Type': 'application/json',
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
},
});
if (!response.ok) {
throw new Error('Failed to fetch server props');
}
const data = await response.json();
return data as LlamaCppServerProps;
} catch (error) {
console.error('Error fetching server props:', error);
throw error;
}
};

View File

@@ -48,7 +48,11 @@ export interface Message {
children: Message['id'][];
}
type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraAudioFile
| MessageExtraContext;
export interface MessageExtraTextFile {
type: 'textFile';
@@ -56,12 +60,43 @@ export interface MessageExtraTextFile {
content: string;
}
export interface MessageExtraImageFile {
type: 'imageFile';
name: string;
base64Url: string;
}
export interface MessageExtraAudioFile {
type: 'audioFile';
name: string;
base64Data: string;
mimeType: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
content: string;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export type APIMessageContentPart =
| {
type: 'text';
text: string;
}
| {
type: 'image_url';
image_url: { url: string };
}
| {
type: 'input_audio';
input_audio: { data: string; format: 'wav' | 'mp3' };
};
export type APIMessage = {
role: Message['role'];
content: string | APIMessageContentPart[];
};
export interface Conversation {
id: string; // format: `conv-{timestamp}`
@@ -96,4 +131,15 @@ export interface SettingsPreset {
name: string;
createdAt: number; // timestamp from Date.now()
config: Record<string, string | number | boolean>; // partial CONFIG_DEFAULT
}
// an incomplete list of props; only contains the ones we need
export interface LlamaCppServerProps {
model_path: string;
n_ctx: number;
modalities?: {
vision: boolean;
audio: boolean;
};
// TODO: support params
}
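
LlamaCppServerProps mirrors only the fields the webui reads from the server's /props endpoint. A plausible, abridged example of what getServerProps might return for a vision-capable model (values are hypothetical):

// Hypothetical /props payload; extra fields returned by the server are ignored.
const exampleProps: LlamaCppServerProps = {
  model_path: '/models/qwen2-vl-7b-instruct-q4_k_m.gguf',
  n_ctx: 8192,
  modalities: {
    vision: true,
    audio: false,
  },
};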

View File

@@ -7,7 +7,7 @@ import zlib from 'node:zlib';
/* eslint-disable */
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const MAX_BUNDLE_SIZE = 2 * 1024 * 1024; // only increase when absolutely necessary
const GUIDE_FOR_FRONTEND = `
<!--

View File

@@ -99,6 +99,18 @@ ggml_cgraph * llm_build_context::build_k_shift() {
GGML_ASSERT(kv_self.size == n_ctx);
const auto & rope_type_shift = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
const float yarn_attn_factor_shift = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
cb(lctx.inp_K_shift, "K_shift", -1);
ggml_set_input(lctx.inp_K_shift);
@@ -127,15 +139,15 @@ ggml_cgraph * llm_build_context::build_k_shift() {
}
}
tmp = ggml_rope_ext_inplace(ctx0, tmp,
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
lctx.inp_K_shift, rope_factors, n_rot, rope_type_shift, n_ctx_orig, freq_base, freq_scale,
ext_factor, yarn_attn_factor_shift, beta_fast, beta_slow);
cb(tmp, "K_shifted_f32", il);
tmp = ggml_cpy(ctx0, tmp, k);
} else {
// we rotate only the first n_rot dimensions
tmp = ggml_rope_ext_inplace(ctx0, k,
lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
lctx.inp_K_shift, rope_factors, n_rot, rope_type_shift, n_ctx_orig, freq_base, freq_scale,
ext_factor, yarn_attn_factor_shift, beta_fast, beta_slow);
}
cb(tmp, "K_shifted", il);
ggml_build_forward_expand(gf, tmp);