Fix missing embedding results, CORS, and crash when using verbose in server (#924)

* server: fix crash when prompt has image and is too long
* server: fix CORS
* server: fix empty result for embedding
* change error message to truncate prompt
* server: fix slot id for save and load state
* bug fix
* server: update slot similarity to handle mtmd
* server: quick hack to calculate number of tokens processed with image
* server: fix out of range error when detokenizing prompt under verbose
* Add back Access-Control-Allow-Origin
* Server: Add prompt tokens in embedding results

---------

Co-authored-by: firecoperana <firecoperana>
2026-04-27 09:53:40 +00:00 · 2025-11-09 12:16:03 +00:00
parent 5cc15d0ecf
commit b63309a918
3 changed files with 139 additions and 91 deletions
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1304,11 +1304,12 @@ public:

    // encode and decode the image chunk
    int32_t process_chunk(
-        llama_context* ctx,
-        mtmd_context* mctx,
+        llama_context * ctx,
+        mtmd_context * mctx,
        llama_pos n_past,
        int32_t seq_id,
-        llama_pos& n_pos_out) {
+        llama_pos & n_pos_out,
+        size_t & n_tokens_out) {
        char buffer[512];
        auto& chunk = find_chunk(n_past);
        const char* name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
@@ -1325,21 +1326,25 @@ public:
            n_batch,
            true, // logits last
            &new_n_past);
+        // get number of tokens in the image
+        const size_t new_n_tokens = mtmd_input_chunk_get_n_tokens(chunk.get());
        snprintf(buffer, 512, "processed in %g ms", 1.*(ggml_time_ms() - t0));
        LOG_INFO(buffer, {});
        if (result != 0) {
            snprintf(buffer, 512, "mtmd_helper_eval failed with status %d", result);
            LOG_ERROR(buffer, {});
            n_pos_out = n_past;
+            n_tokens_out = 0;
            return result;
        }
        n_pos_out = new_n_past;
+        n_tokens_out = new_n_tokens;
        return 0;
    }
 };

 // Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t* data, size_t len) {
+static std::string fnv_hash(const uint8_t * data, size_t len) {
    const uint64_t fnv_prime = 0x100000001b3ULL;
    uint64_t hash = 0xcbf29ce484222325ULL;

@@ -1350,7 +1355,7 @@ static std::string fnv_hash(const uint8_t* data, size_t len) {
    return std::to_string(hash);
 }

-static server_tokens process_mtmd_prompt(mtmd_context* mctx, std::string prompt, std::vector<raw_buffer> files) {
+static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
    mtmd::bitmaps bitmaps;
    for (auto& file : files) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));