Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Port mtmd from mainline + Qwen2/2.5-VL support (#798)
* Add mtmd: the beginning
* Add mtmd: mtmd.cpp compiles
* Add mtmd: clip initialization compiles
* Add mtmd: clip.cpp compiles
* Add mtmd: builds successfully
* Add CPU implementation for GGML_OP_GLU
* Add CUDA implementation for GGML_OP_GLU
* Add CPU implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW
* Add CUDA implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW
* Add mtmd: refresh CPU rope
* Add mtmd: refresh CUDA rope
* Add mtmd: add Qwen2-VL
* Add mtmd: Qwen2.5-VL text seems to work with this change
* Add mtmd: fix swiglu
* Add mtmd: use LOG_TEE so generated tokens show up in terminal
* Add mtmd: do not attempt to load a GPU backend if none are available
* GLU, not GPU
* Fix typo
* Fix new/free mismatch
* LOG stuff
* Add mtmd: this fixes gibberish on second image

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
examples/CMakeLists.txt (file name inferred from context)
@@ -29,7 +29,6 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
@@ -39,6 +38,7 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
+    add_subdirectory(mtmd)
     if (GGML_RPC)
         add_subdirectory(rpc)
     endif()
examples/main/main.cpp (file name inferred from context)
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
     if (params.conversation) {
         if (params.enable_chat_template) {
             //LOG_TEE("%s: chat template example: %s\n", __func__, common_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
-            LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
+            LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, {}).c_str());
         } else {
             LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
examples/mtmd/CMakeLists.txt (new file, 62 lines)
@@ -0,0 +1,62 @@
# mtmd

find_package(Threads REQUIRED)

add_library(mtmd
            mtmd.cpp
            mtmd-audio.cpp
            mtmd.h
            clip.cpp
            clip.h
            clip-impl.h
            mtmd-helper.cpp
            mtmd-helper.h
            )

target_link_libraries     (mtmd PUBLIC  ggml llama)
target_link_libraries     (mtmd PRIVATE Threads::Threads)
target_include_directories(mtmd PUBLIC  .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features   (mtmd PRIVATE cxx_std_17)

if (BUILD_SHARED_LIBS)
    set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
    target_compile_definitions(mtmd PUBLIC  LLAMA_SHARED)
endif()

set(MTMD_PUBLIC_HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
    )

set_target_properties(mtmd
    PROPERTIES
    PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")

install(TARGETS mtmd LIBRARY PUBLIC_HEADER)

if (NOT MSVC)
    # for stb_image.h and miniaudio.h
    target_compile_options(mtmd PRIVATE -Wno-cast-qual)
endif()

if (TARGET BUILD_INFO)
    add_dependencies(mtmd BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
endif()

add_executable(llama-llava-cli    deprecation-warning.cpp)
add_executable(llama-gemma3-cli   deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)

set(TARGET llama-mtmd-cli)
add_executable         (${TARGET} mtmd-cli.cpp)
set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
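With the file above in place, the new CLI target builds like the other example targets (a minimal sketch; the `build` directory name is a common convention, not something this commit mandates):

```sh
cmake -B build
cmake --build build --config Release --target llama-mtmd-cli
```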
examples/mtmd/README.md (new file, 63 lines)
@@ -0,0 +1,63 @@
# Multimodal Support in llama.cpp

This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.

> [!IMPORTANT]
>
> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.

The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:

- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, making it the second supported vision model. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users.
- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.

## Pre-quantized models

See the list of pre-quantized models [here](../../docs/multimodal.md).

## How it works and what is `mmproj`?

Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.

This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.

Consequently, running a multimodal model typically requires two GGUF files (an example invocation follows this list):
1. The standard language model file.
2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.
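For example, `llama-mtmd-cli` loads both files together (a sketch with placeholder file names; the flag spellings follow the mainline tool and may differ slightly in this port):

```sh
# -m loads the language model, --mmproj the projector, --image the input image
llama-mtmd-cli -m model-Q4_K_M.gguf --mmproj mmproj-model-f16.gguf \
    --image input.jpg -p "Describe this image."
```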

## What is `libmtmd`?

As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.

Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.

## How to obtain `mmproj`

Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file (see the example invocation after this list):
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d); see the guide [here](../../docs/multimodal/gemma3.md). Note: the 1B variant does not have vision support.
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with a `transformers`-compatible checkpoint
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: conversion of `InternVL3-*-hf` models is not supported, only the non-HF version; the `InternLM2Model` **text** model is not supported)
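A typical invocation looks like this (a sketch; the model directory and output name are placeholders, and only `--mmproj` is documented above, the other flags are the converter's usual ones):

```sh
python convert_hf_to_gguf.py ./Qwen2.5-VL-7B-Instruct --mmproj \
    --outfile mmproj-qwen2.5-vl-7b-f16.gguf
```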

For older models, please refer to the relevant guide for instructions on how to obtain or create the `mmproj` file:

NOTE: the conversion scripts are located under `examples/mtmd/legacy-models`

- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
examples/mtmd/clip-impl.h (new file, 474 lines)
@@ -0,0 +1,474 @@
#include "ggml.h"
#include "clip.h"

#include <climits>
#include <cstdarg>
#include <cinttypes>
#include <string>
#include <map>
#include <sstream>
#include <vector>
#include <memory>

// Internal header for clip.cpp

#define KEY_FTYPE               "general.file_type"
#define KEY_NAME                "general.name"
#define KEY_DESCRIPTION         "general.description"
#define KEY_PROJ_TYPE           "clip.projector_type"
#define KEY_HAS_AUDIO_ENC       "clip.has_audio_encoder"
#define KEY_HAS_VISION_ENC      "clip.has_vision_encoder"
#define KEY_USE_GELU            "clip.use_gelu"
#define KEY_USE_SILU            "clip.use_silu"

#define KEY_N_EMBD              "clip.%s.embedding_length"
#define KEY_N_FF                "clip.%s.feed_forward_length"
#define KEY_N_BLOCK             "clip.%s.block_count"
#define KEY_PROJ_DIM            "clip.%s.projection_dim"
#define KEY_N_HEAD              "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"

// vision-specific
#define KEY_IMAGE_SIZE          "clip.vision.image_size"
#define KEY_PATCH_SIZE          "clip.vision.patch_size"
#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
#define KEY_IMAGE_STD           "clip.vision.image_std"
#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"

#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"
#define KEY_MINICPMV_VERSION      "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM    "clip.minicpmv_query_num"

// audio-specific
#define KEY_A_NUM_MEL_BINS        "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR   "clip.audio.projector.stack_factor"
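
// note: the "%s" keys above are printf-style format strings that get filled in
// with the modality prefix, e.g. formatting KEY_N_EMBD with "vision" yields
// "clip.vision.embedding_length"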

//
// tensor name constants
//

#define TN_POS_EMBD        "%s.position_embd.weight"
#define TN_CLASS_EMBD      "v.class_embd"
#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not renamed with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
#define TN_PATCH_BIAS      "v.patch_embd.bias"
#define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
#define TN_ATTN_K_NORM     "%s.blk.%d.attn_k_norm.%s"
#define TN_ATTN_Q_NORM     "%s.blk.%d.attn_q_norm.%s"
#define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
#define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm
#define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm
#define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
#define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
#define TN_LN_PRE          "%s.pre_ln.%s"
#define TN_LN_POST         "%s.post_ln.%s"
#define TN_LLAVA_PROJ      "mm.%d.%s"
#define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE   "model.image_newline"
#define TN_MM_INP_NORM     "mm.input_norm.weight"
#define TN_MM_INP_NORM_B   "mm.input_norm.bias"
#define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
#define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
#define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
#define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in the text model)
#define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in the text model)

// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY      "resampler.query"
#define TN_MINICPMV_PROJ       "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ    "resampler.kv.weight"
#define TN_MINICPMV_ATTN       "resampler.attn.%s.%s"
#define TN_MINICPMV_LN         "resampler.ln_%s.%s"

#define TN_GLM_ADAPER_CONV      "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR   "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1   "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"

// ultravox
#define TN_CONV1D       "a.conv1d.%d.%s"
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
#define TN_MM_AUDIO_FC  "mm.a.fc.%s" // fully connected layer
#define TN_MM_NORM_PRE  "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID  "mm.a.norm_mid.%s"

// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
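// e.g. CLIP_ALIGN(570, 64) == 576 (570 rounded up to the next multiple of 64)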

enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_MINICPMV,
    PROJECTOR_TYPE_GLM_EDGE,
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
    PROJECTOR_TYPE_ULTRAVOX,
    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_LLAMA4,
    PROJECTOR_TYPE_QWEN2A,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_UNKNOWN,
};

static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP,       "mlp" },
    { PROJECTOR_TYPE_LDP,       "ldp" },
    { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
    { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
    for (const auto & pair : PROJECTOR_TYPE_NAMES) {
        if (pair.second == str) {
            return pair.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}
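// e.g. clip_projector_type_from_string("qwen2vl_merger") == PROJECTOR_TYPE_QWEN2VL,
// while any unrecognized string maps to PROJECTOR_TYPE_UNKNOWN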

// RGB uint8 image
struct clip_image_u8 {
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
    int nx;
    int ny;

    std::vector<float> buf;
};

//
// logging
//

static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}

struct clip_logger_state {
    ggml_log_level verbosity_thold;
    ggml_log_callback log_callback;
    void * log_callback_user_data;
};

extern struct clip_logger_state g_logger_state;

static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
    if (format == NULL) {
        return;
    }
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    int len = vsnprintf(buffer, 128, format, args);
    if (len < 128) {
        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
    } else {
        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
        vsnprintf(buffer2, len + 1, format, args_copy);
        buffer2[len] = 0;
        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
        free(buffer2);
    }
    va_end(args_copy);
}

static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
    clip_log_internal_v(level, format, args);
    va_end(args);
}

#define LOG_TMPL(level, ...) \
    do { \
        if ((level) >= g_logger_state.verbosity_thold) { \
            clip_log_internal((level), __VA_ARGS__); \
        } \
    } while (0)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)

//
// cpp wrappers
//

// wrapper for clip_image_size
struct clip_image_size_deleter {
    void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;

// wrapper for clip_image_u8
struct clip_image_u8_deleter {
    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
};
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;

// wrapper for clip_image_f32
struct clip_image_f32_deleter {
    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
};
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;

struct clip_image_u8_batch {
    std::vector<clip_image_u8_ptr> entries;
};

struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;
    bool is_audio = false;

    // for llava-uhd style models, we need to know the grid size
    // note: entries.size() == grid_x * grid_y + 1 (one overview image)
    int grid_x = 0;
    int grid_y = 0;

    clip_image_f32_batch clone() const {
        clip_image_f32_batch new_batch{
            /* entries  */ {},
            /* is_audio */ is_audio,
            /* grid_x   */ grid_x,
            /* grid_y   */ grid_y,
        };
        new_batch.entries.reserve(entries.size());
        for (const auto & entry : entries) {
            new_batch.entries.emplace_back(new clip_image_f32(*entry));
        }
        return new_batch;
    }
};

//
// common utils
//

static std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size); // do not include the null terminator
}

static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
}

// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
    std::vector<std::string> tokens;
    size_t pos = 0;
    std::string token;
    while ((pos = s.find(delimiter)) != std::string::npos) {
        token = s.substr(0, pos);
        tokens.push_back(token);
        s.erase(0, pos + delimiter.length());
    }
    tokens.push_back(s);
    return tokens;
}
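// e.g. string_split_str("a.b.c", ".") returns {"a", "b", "c"}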

//
// gguf utils
//

static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
    switch (type) {
        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
        default:                return string_format("unknown type %d", type);
    }
}

static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

    switch (type) {
        case GGUF_TYPE_STRING:
            return gguf_get_val_str(ctx_gguf, i);
        case GGUF_TYPE_ARRAY:
            {
                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
                int arr_n = gguf_get_arr_n(ctx_gguf, i);
                const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
                std::stringstream ss;
                ss << "[";
                for (int j = 0; j < arr_n; j++) {
                    if (arr_type == GGUF_TYPE_STRING) {
                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
                        // escape quotes
                        string_replace_all(val, "\\", "\\\\");
                        string_replace_all(val, "\"", "\\\"");
                        ss << '"' << val << '"';
                    } else if (arr_type == GGUF_TYPE_ARRAY) {
                        ss << "???";
                    } else {
                        ss << gguf_data_to_str(arr_type, data, j);
                    }
                    if (j < arr_n - 1) {
                        ss << ", ";
                    }
                }
                ss << "]";
                return ss.str();
            }
        default:
            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
    }
}

//
// debugging
//

static void print_tensor_shape(ggml_tensor * t) {
    printf("%s.shape = [", t->name);
    for (int i = 0; i < ggml_n_dims(t); ++i) {
        printf("%" PRId64, t->ne[i]);
        if (i < ggml_n_dims(t) - 1) {
            printf(", ");
        }
    }
    printf("]\n");
}
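
// prints the tensor values; along each dimension only the first and last n
// entries are shown, with "..." standing in for the elided middle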
static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
    ggml_type type = t->type;
    int64_t * ne = t->ne;
    size_t  * nb = t->nb;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        printf("%s.data: [\n", t->name);
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
                printf("     ..., \n");
                i2 = ne[2] - n;
            }
            printf("     [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
                    printf("      ..., \n");
                    i1 = ne[1] - n;
                }
                printf("      [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) &data[i];
                    } else {
                        GGML_ABORT("fatal error");
                    }
                    printf("%8.4f", v);
                    if (i0 < ne[0] - 1) printf(", ");
                }
                printf("],\n");
            }
            printf("     ],\n");
        }
        printf(" ]\n");
    }
}

//
// API used internally with mtmd
//

projector_type clip_get_projector_type(const struct clip_ctx * ctx);
examples/mtmd/clip.cpp (new file, 4446 lines)
(File diff suppressed because it is too large.)
examples/mtmd/clip.h (new file, 106 lines)
@@ -0,0 +1,106 @@
#pragma once

#include "ggml.h"
#include <stddef.h>
#include <stdint.h>

// !!! Internal header, to be used by mtmd only !!!

struct clip_ctx;

struct clip_image_size {
    int width;
    int height;
};

struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;

enum clip_modality {
    CLIP_MODALITY_VISION,
    CLIP_MODALITY_AUDIO,
};

struct clip_context_params {
    bool use_gpu;
    enum ggml_log_level verbosity;
};

struct clip_init_result {
    struct clip_ctx * ctx_v; // vision context
    struct clip_ctx * ctx_a; // audio context
};

struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);

void clip_free(struct clip_ctx * ctx);

size_t clip_embd_nbytes(const struct clip_ctx * ctx);
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);

int32_t clip_get_image_size (const struct clip_ctx * ctx);
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);

// TODO: should be enum, not string
const char * clip_patch_merge_type(const struct clip_ctx * ctx);

int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);

// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
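// e.g. (illustrative numbers) an image whose patches form a 24x16 grid gives
// x == 24 and y == 16 under M-RoPE, while a model emitting 256 tokens without
// M-RoPE gives x == 256 and y == 1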

// this should be equal to the embedding dimension of the text model
int clip_n_mmproj_embd(const struct clip_ctx * ctx);

struct clip_image_size      * clip_image_size_init(void);
struct clip_image_u8        * clip_image_u8_init (void);
struct clip_image_f32       * clip_image_f32_init(void);
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

// nx, ny are the output image dimensions
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);

void clip_image_size_free (struct clip_image_size * img_size);
void clip_image_u8_free   (struct clip_image_u8  * img);
void clip_image_f32_free  (struct clip_image_f32 * img);
void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

// use for accessing underlying data of clip_image_f32_batch
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

/**
 * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
 * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
 */
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);

struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

int  clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm     (const struct clip_ctx * ctx);
bool clip_is_qwen2vl (const struct clip_ctx * ctx);
bool clip_is_llava   (const struct clip_ctx * ctx);
bool clip_is_gemma3  (const struct clip_ctx * ctx);

bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

// used by audio input
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);

bool clip_has_vision_encoder (const struct clip_ctx * ctx);
bool clip_has_audio_encoder  (const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
examples/mtmd/deprecation-warning.cpp (new file, 22 lines)
@@ -0,0 +1,22 @@
#include <cstdio>
#include <cstdlib> // for EXIT_FAILURE
#include <string>

int main(int argc, char** argv) {
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
    }

    // Get only the program name from the full path
    size_t pos = filename.find_last_of("/\\");
    if (pos != std::string::npos) {
        filename = filename.substr(pos+1);
    }

    fprintf(stdout, "\n");
    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
    fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
    fprintf(stdout, "\n");

    return EXIT_FAILURE;
}
examples/mtmd/legacy-models/convert_image_encoder_to_gguf.py (new file, 412 lines)
@@ -0,0 +1,412 @@
import argparse
import os
import json
import re

import torch
import numpy as np
from gguf import *
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

TEXT = "clip.text"
VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)

def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    # Standardize the transformers llava next keys for
    # image newline / mm projector with the classes in haotian-liu LLaVA
    if name == "image_newline":
        return "model.image_newline"
    if name.startswith("multi_modal_projector"):
        name = name.replace("multi_modal_projector", "mm")
        if "linear_1" in name:
            name = name.replace("linear_1", "0")
        if "linear_2" in name:
            name = name.replace("linear_2", "2")
        return name

    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
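
# e.g. get_tensor_name("vision_model.encoder.layers.0.self_attn.k_proj.weight")
# returns "v.blk.0.attn_k.weight"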

def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")

# Selectable visual encoders that are compatible with this script
encoder_group = ap.add_mutually_exclusive_group()
encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                           help="The clip model is from openclip (for ViT-SO400M type)")
encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
                           help="the visual encoder is Siglip.")

ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

args = ap.parse_args()
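
# Example invocation (hypothetical paths):
#   python convert_image_encoder_to_gguf.py -m ./llava-v1.5-7b \
#       --llava-projector ./llava-v1.5-7b/llava.projector -o ./gguf-out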

if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

if (
    args.clip_model_is_vision or
    not os.path.exists(dir_model + "/vocab.json") or
    args.clip_model_is_openclip or
    args.clip_model_is_siglip
):
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = config["text_config"]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

if args.clip_model_is_siglip:
    model = SiglipVisionModel.from_pretrained(dir_model)
    processor = None
elif args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
else:
    model = CLIPModel.from_pretrained(dir_model)
    processor = CLIPProcessor.from_pretrained(dir_model)

fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_llava_projector = False
if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_llava_projector", has_llava_projector)
fout.add_file_type(ftype)
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
fout.add_name(model_name)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
else:
    fout.add_description("two-tower CLIP model")

if has_text_encoder:
    assert t_hparams is not None
    assert tokens is not None
    if args.clip_model_is_siglip:
        text_projection_dim = 0
    else:
        text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", text_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)


def get_non_negative_vision_feature_layers(v_hparams):
    """
    Determine the vision feature layer(s) for the llava model, which are indices into the
    hidden states of the visual encoder. Note that the hidden states array generally takes the
    form:

        [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]

    so feature indices should be offset as n+1 to get the output of encoder block n.
    We convert all vision feature layers to non-negative so that -1 can be used in
    the model as an unset value. If no vision feature layer is found, we leave it unset.
    """
    num_hidden_layers = v_hparams["num_hidden_layers"]
    to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
    feature_layers_key = None
    # Key used for llava models in transformers
    if "vision_feature_layer" in config:
        feature_layers_key = "vision_feature_layer"
    # Key used for llava models in the original format
    elif "mm_vision_select_layer" in config:
        feature_layers_key = "mm_vision_select_layer"
    if feature_layers_key is not None:
        feature_layers = config[feature_layers_key]
        if isinstance(feature_layers, int):
            feature_layers = [feature_layers]
        return [to_non_negative(feature_layer) for feature_layer in feature_layers]

# Determine if we have explicitly specified vision feature layers in our config
feature_layers = get_non_negative_vision_feature_layers(v_hparams)
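# e.g. with num_hidden_layers == 24, a configured layer index of -2 maps to 24 + (-2) + 1 == 23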

if has_vision_encoder:
    # Siglip does not have a visual projector; set projection dim to 0
    if args.clip_model_is_siglip:
        visual_projection_dim = 0
    else:
        visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])

    # set vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    if feature_layers:
        block_count = max(feature_layers)
    else:
        block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
    # e.g. "image_grid_pinpoints": [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    # is flattened to: [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
if "image_grid_pinpoints" in v_hparams:
|
||||
# flatten it
|
||||
image_grid_pinpoints = []
|
||||
for pinpoint in v_hparams["image_grid_pinpoints"]:
|
||||
for p in pinpoint:
|
||||
image_grid_pinpoints.append(p)
|
||||
fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
|
||||
if "image_crop_resolution" in v_hparams:
|
||||
fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
|
||||
if "image_aspect_ratio" in v_hparams:
|
||||
fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
|
||||
if "image_split_resolution" in v_hparams:
|
||||
fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
|
||||
if "mm_patch_merge_type" in v_hparams:
|
||||
fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
|
||||
if "mm_projector_type" in v_hparams:
|
||||
fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
|
||||
if feature_layers:
|
||||
fout.add_array("clip.vision.feature_layer", feature_layers)
|
||||
|
||||
if processor is not None:
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
|
||||
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
|
||||
else:
|
||||
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||
fout.add_array("clip.vision.image_mean", image_mean)
|
||||
fout.add_array("clip.vision.image_std", image_std)
|
||||
|
||||
use_gelu = v_hparams["hidden_act"] == "gelu"
|
||||
fout.add_bool("clip.use_gelu", use_gelu)
|
||||
|
||||
|
||||


if has_llava_projector:
    # By default, we drop the last layer for llava projector
    # models unless we have explicitly set vision feature layers
    if feature_layers is None:
        model.vision_model.encoder.layers.pop(-1)
    else:
        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]

    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)

        fout.add_tensor(name, data)

    print("Projector tensors added\n")

state_dict = model.state_dict()
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)

examples/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py (new file, 280 lines; path inferred from the GLM-Edge/glm4v content below)
@@ -0,0 +1,280 @@
import argparse
import os
import json
import re

import torch
import numpy as np
from gguf import *
from transformers import SiglipVisionModel, SiglipVisionConfig

TEXT = "clip.text"
VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)

def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if name in (
        "vision_model.head.probe",
        "vision_model.head.attention.in_proj_weight",
        "vision_model.head.attention.in_proj_bias",
        "vision_model.head.attention.out_proj.weight",
        "vision_model.head.attention.out_proj.bias",
        "vision_model.head.layernorm.weight",
        "vision_model.head.layernorm.bias",
        "vision_model.head.mlp.fc1.weight",
        "vision_model.head.mlp.fc1.bias",
        "vision_model.head.mlp.fc2.weight",
        "vision_model.head.mlp.fc2.bias"
    ):
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")

def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2, adapter", choices=["mlp", "ldp", "ldpv2", "adapter"], default="adapter")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

args = ap.parse_args()
if args.text_only and args.vision_only:
|
||||
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||
exit(1)
|
||||
|
||||
if args.use_f32:
|
||||
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = None

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

vision_config = SiglipVisionConfig(**v_hparams)
model = SiglipVisionModel(vision_config)
model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip")))
fname_middle = None
has_text_encoder = False
has_vision_encoder = True
has_glm_projector = True
if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_glm_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||
fout.add_bool("clip.has_glm_projector", has_glm_projector)
|
||||
fout.add_file_type(ftype)
|
||||
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
|
||||
fout.add_name(model_name)
|
||||
if has_glm_projector:
|
||||
fout.add_description("image encoder for glm4v")
|
||||
fout.add_string("clip.projector_type", "adapter")
|
||||
else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
if has_text_encoder:
|
||||
assert t_hparams is not None
|
||||
assert tokens is not None
|
||||
# text_model hparams
|
||||
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
||||
fout.add_token_list(tokens)
|
||||
|
||||
if has_vision_encoder:
|
||||
# vision_model hparams
|
||||
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
||||
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"])
|
||||
|
||||
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||
fout.add_array("clip.vision.image_mean", image_mean)
|
||||
fout.add_array("clip.vision.image_std", image_std)
|
||||
|
||||
fout.add_bool("clip.use_gelu", True)
|
||||
|
||||
|
||||
if has_glm_projector:
    # model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim == 4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
        if name.startswith("vision."):
            name = name.replace("vision.", "")
        fout.add_tensor(name, data)
        print(f"Projector {name} - {data.dtype} - shape = {data.shape}")
    # print(f"Projector {name} tensors added\n")
state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            # print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            # print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            # print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    print(f"siglip {name} - {data.dtype} - shape = {data.shape}")
    # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
33
examples/mtmd/legacy-models/glmedge-surgery.py
Normal file
@@ -0,0 +1,33 @@
import argparse
import os
import torch
from transformers import AutoModel

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to GLM model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/glm.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")]
if len(clip_tensors) > 0:
    clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/glm.clip")

# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
    with open(f"{args.model}/added_tokens.json", "w") as f:
        f.write("{}\n")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/glm.projector to prepare a glm-encoder.gguf file.")
38
examples/mtmd/legacy-models/llava_surgery.py
Normal file
@@ -0,0 +1,38 @@
import argparse
import glob
import os
import torch


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
checkpoint = torch.load(path)

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")

# BakLLaVA models contain CLIP tensors in them
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")


# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
    with open(f"{args.model}/added_tokens.json", "w") as f:
        f.write("{}\n")


print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
180
examples/mtmd/legacy-models/llava_surgery_v2.py
Normal file
@@ -0,0 +1,180 @@
import argparse
import glob
import os
import torch
from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast

# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
    return file_path.endswith('.safetensors')


# Unified loading function
def load_model(file_path):
    if is_safetensor_file(file_path):
        tensors = {}
        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key).clone()
                # output shape
                print(f"{key} : {tensors[key].shape}")
        return tensors, 'safetensor'
    else:
        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'


# Unified saving function
def save_model(model, file_path, file_type):
    if file_type == 'safetensor':
        # safe_save(model, file_path)
        save_file(model, file_path)
    else:
        torch.save(model, file_path)
# Helpers to match weight names from specific components or
# determine if a saved shard contains that component
def is_vision_tower(weight_name):
    return (
        weight_name.startswith("model.vision_tower") or
        weight_name.startswith("vit.") or
        weight_name.startswith("vision_tower")
    )

def is_newline(weight_name):
    return (
        weight_name.startswith("model.image_newline") or
        weight_name.startswith("image_newline")
    )

def is_mm_projector(weight_name):
    return (
        weight_name.startswith("model.mm_projector") or
        weight_name.startswith("vision_proj.") or
        weight_name.startswith("multi_modal_projector")
    )

def newline_criteria(checkpoint):
    return any(is_newline(k) for k in checkpoint.keys())

def proj_criteria(checkpoint):
    return any(is_mm_projector(k) for k in checkpoint.keys())
# Adapted function to clean vision tower from checkpoint
def clean_vision_tower_from_checkpoint(checkpoint_path):
    checkpoint, file_type = load_model(checkpoint_path)
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
    clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)]

    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
        # Adapted for file type
        clip_path = os.path.join(model_path, "llava.clip")

        if os.path.exists(clip_path):
            print(f"Loading existing llava.clip from {clip_path}")
            existing_clip, _ = load_model(clip_path)
        else:
            print(f"Creating new llava.clip at {clip_path}")
            existing_clip = {}
        # Update existing_clip with new tensors, avoid duplicates
        for name in clip_tensors:
            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
            print(f"Adding {simple_name} to llava.clip")
            if simple_name not in existing_clip:
                existing_clip[simple_name] = checkpoint[name]

        # Save the updated clip tensors back to llava.clip
        save_model(existing_clip, clip_path, 'pytorch')

        # Remove the tensors from the original checkpoint
        for name in clip_tensors:
            del checkpoint[name]

        return True
    return False
def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
    newline_checkpoint_path = None
    projector_checkpoint_path = None

    for path in checkpoint_paths:
        checkpoint, _ = load_model(path)
        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
            newline_checkpoint_path = path
        if projector(checkpoint):
            projector_checkpoint_path = path

    return newline_checkpoint_path, projector_checkpoint_path
# Command-line interface setup
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
args = ap.parse_args()

if args.clean_vision_tower:
    # Generalized to handle both PyTorch and SafeTensors models
    model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
    # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
    checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
    for projector_checkpoint_path in checkpoint_paths:
        print(f"Cleaning {projector_checkpoint_path}")
        if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
            print(f"No vision tower found in {projector_checkpoint_path}")
            # we break once none is found, so far all models append them at the end
            # break
    print("Done! All vision tower tensors are removed from the model files and stored in the llava.clip file.")

# Now we look for the projector in the last checkpoint
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
# last_checkpoint_path = checkpoint_paths[0]
# first_checkpoint_path = checkpoint_paths[-1]
newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)

print(f"Taking projector from {projector_checkpoint_path}")
first_mm_tensors = []
first_checkpoint = None
if newline_checkpoint_path is not None:
    print(f"Taking newline from {newline_checkpoint_path}")
    first_checkpoint, file_type = load_model(newline_checkpoint_path)
    first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)]

# Load the checkpoint
mm_tensors = []
last_checkpoint = None
if projector_checkpoint_path is not None:
    last_checkpoint, file_type = load_model(projector_checkpoint_path)
    mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)]

if len(mm_tensors) == 0:
    if last_checkpoint is not None:
        for k, v in last_checkpoint.items():
            print(k)
    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
    print("No tensors found. Is this a LLaVA model?")
    exit()

print(f"Found {len(mm_tensors)} tensors to extract.")
print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
    assert last_checkpoint is not None
    projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
    assert first_checkpoint is not None
    projector[name] = first_checkpoint[name].float()

if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
@@ -0,0 +1,885 @@
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
import os
import math
import warnings

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()

    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
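As a quick illustration (mine, not from the ported file) of what _get_unpad_data computes for a padded batch:

# Toy attention mask: batch of 2, max length 4, real lengths 3 and 2.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
indices, cu_seqlens, max_len = _get_unpad_data(mask)
# indices    -> flattened positions of real tokens: [0, 1, 2, 4, 5]
# cu_seqlens -> cumulative sequence lengths with a leading 0: [0, 3, 5]
# max_len    -> longest real sequence in the batch: 3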
def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor.erfinv_()
        tensor = tensor.to(og_dtype)
    else:
        tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
    else:
        tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX implementation, where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
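A sanity sketch (illustrative only): because truncation happens in standard-normal space before scaling, every sample lands in [mean + a*std, mean + b*std].

w = torch.empty(1000)
trunc_normal_tf_(w, mean=0.5, std=0.1, a=-2.0, b=2.0)
assert w.min() >= 0.5 - 2.0 * 0.1 and w.max() <= 0.5 + 2.0 * 0.1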
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    denom = fan_in
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
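For reference (my arithmetic, not from the file): lecun_normal_ targets variance 1/fan_in, so for a (256, 128) linear weight the sample variance comes out near 1/128 ≈ 0.0078.

w = torch.empty(256, 128)   # fan_in = 128
lecun_normal_(w)
print(w.var().item())       # approximately 0.0078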
class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
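For reference (my arithmetic, not part of the file): with the SigLIP vision defaults this converter uses later (hidden_size=1152, 16 heads), the bookkeeping above gives head_dim = 1152 / 16 = 72 and scale = 72**-0.5 ≈ 0.118.

cfg = SiglipVisionConfig(hidden_size=1152, num_attention_heads=16)
attn = SiglipAttention(cfg)
assert attn.head_dim == 72
assert abs(attn.scale - 72 ** -0.5) < 1e-9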
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
|
||||
class SiglipMLP(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.activation_fn = ACT2FN[config.hidden_act]
|
||||
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
|
||||
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
|
||||
class SiglipEncoderLayer(nn.Module):
|
||||
def __init__(self, config: SiglipVisionConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
|
||||
self.self_attn = (
|
||||
SiglipAttention(config)
|
||||
)
|
||||
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = SiglipMLP(config)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SIGLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
|
||||
class SiglipEncoder(nn.Module):
|
||||
"""
|
||||
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
|
||||
[`SiglipEncoderLayer`].
|
||||
Args:
|
||||
config: SiglipConfig
|
||||
"""
|
||||
|
||||
def __init__(self, config: SiglipVisionConfig):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding
import argparse
import json
import re

import numpy as np
from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig

TEXT = "clip.text"
VISION = "clip.vision"


def add_key_str(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_minicpmv and name in ["visual_projection.weight"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False
def get_tensor_name(name: str) -> str:
|
||||
if "projection" in name:
|
||||
return name
|
||||
if "mm_projector" in name:
|
||||
name = name.replace("model.mm_projector", "mm")
|
||||
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
|
||||
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
|
||||
return name
|
||||
|
||||
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||
|
||||
|
||||
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int,
                help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6',
                default=2)

args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

# Read config.json to get the actual model configuration
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if os.path.isfile(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)
    print(f"Loaded config from {config_path}")
else:
    print(f"Warning: config.json not found at {config_path}")

# If minicpmv_projector is not specified but the default path exists, use the default path
if args.minicpmv_projector is None:
    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
    if os.path.isfile(default_projector_path):
        args.minicpmv_projector = default_projector_path
        print(f"Found default projector file: {default_projector_path}")

# If output_dir is not specified, use model_dir as the default value
if args.output_dir is None:
    args.output_dir = dir_model
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]
# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

# if args.clip_model_is_vision or args.clip_model_is_openclip:
#     model = CLIPVisionModel.from_pretrained(dir_model)
#     processor = None
# else:
#     model = CLIPModel.from_pretrained(dir_model)
#     processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version

# Use actual config values instead of hardcoded ones
if model_config:
    # For the projector/resampler, use the main model's hidden_size
    emb_dim = model_config.get("hidden_size", 1536)

    # For the vision model, use vision_config values
    vision_config_dict = model_config.get("vision_config", {})
    default_vision_config = {
        "hidden_size": vision_config_dict.get("hidden_size", 1152),
        "image_size": vision_config_dict.get("image_size", 980),
        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
        "model_type": vision_config_dict.get("model_type", "siglip"),
        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
        "patch_size": vision_config_dict.get("patch_size", 14),
    }

    # Use the vision model's num_hidden_layers for block_count
    block_count = vision_config_dict.get("num_hidden_layers", 27)

    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
    print(f"Vision config: {default_vision_config}")
else:
    # Fall back to the original hardcoded logic if config.json is not found
    emb_dim = 4096
    block_count = 26
    if minicpmv_version == 1:
        emb_dim = 2304
        block_count = 26
    elif minicpmv_version == 2:
        emb_dim = 4096
        block_count = 27
    elif minicpmv_version == 3:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 4:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 5:
        emb_dim = 2560
        block_count = 27
    elif minicpmv_version == 6:
        emb_dim = 4096
        block_count = 27

    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "idefics2",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 5:
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 6:
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

processor = None
# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()

# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False

if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""
output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    # add projector type
    fout.add_string("clip.projector_type", "resampler")
    fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
    fout.add_description("two-tower CLIP model")
if has_vision_encoder:
    # vision_model hparams - use actual config values
    vision_image_size = model_config.get("image_size", 448) if model_config else 448
    vision_patch_size = default_vision_config.get("patch_size", 14)
    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)

    fout.add_uint32("clip.vision.image_size", vision_image_size)
    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # Add MiniCPM-V specific parameters
    query_num = model_config.get("query_num", 0) if model_config else 0
    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
    fout.add_uint32("clip.minicpmv_query_num", query_num)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed
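A quick shape check (illustrative only, not part of the script): each grid cell gets one embedding vector, with rows encoded in the first half of the dimensions and columns in the second half.

pe = get_2d_sincos_pos_embed(128, (70, 70))   # 128 stands in for emb_dim
assert pe.shape == (70 * 70, 128)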
def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}
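The in_proj split above undoes how torch.nn.MultiheadAttention packs Q, K and V as 3*embed_dim stacked rows; a toy check (mine, not from the script):

w = torch.arange(6 * 2).reshape(6, 2).float()   # pretend embed_dim = 2
out = _replace_name_resampler("resampler.attn.in_proj_weight", w)
assert torch.equal(out["resampler.attn.q.weight"], w[0:2])
assert torch.equal(out["resampler.attn.k.weight"], w[2:4])
assert torch.equal(out["resampler.attn.v.weight"], w[4:6])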
if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            if name[-7:] == ".weight" and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")
def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
47
examples/mtmd/legacy-models/minicpmv-surgery.py
Normal file
@@ -0,0 +1,47 @@
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config, 'scale_emb'):
    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
torch.save(projector, f"{args.model}/minicpmv.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/minicpmv.clip")

# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
    with open(f"{args.model}/added_tokens.json", "w") as f:
        f.write("{}\n")

config = model.llm.config
config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
769
examples/mtmd/mtmd-audio.cpp
Normal file
@@ -0,0 +1,769 @@
#include "mtmd-audio.h"

#define _USE_MATH_DEFINES // for M_PI
#include <cmath>
#include <cstdint>
#include <cstring>
#include <thread>
#include <vector>
#include <fstream>
#include <algorithm>

// most of the code here is copied from whisper.cpp

// align x to upper multiple of n
#define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

namespace whisper_preprocessor {

#define SIN_COS_N_COUNT WHISPER_N_FFT
namespace {
struct whisper_global_cache {
    // In FFT, we frequently use sine and cosine operations with the same values.
    // We can use precalculated values to speed up the process.
    float sin_vals[SIN_COS_N_COUNT];
    float cos_vals[SIN_COS_N_COUNT];

    // Hann window (use cosf to eliminate difference)
    // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
    float hann_window[WHISPER_N_FFT];

    whisper_global_cache() {
        fill_sin_cos_table();
        fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
    }

    void fill_sin_cos_table() {
        for (int i = 0; i < SIN_COS_N_COUNT; i++) {
            double theta = (2 * M_PI * i) / SIN_COS_N_COUNT;
            sin_vals[i] = sinf(theta);
            cos_vals[i] = cosf(theta);
        }
    }

    void fill_hann_window(int length, bool periodic, float * output) {
        int offset = -1;
        if (periodic) {
            offset = 0;
        }
        for (int i = 0; i < length; i++) {
            output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
        }
    }
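
    // Worked example (annotation added in this edit, not part of the original
    // patch): with length = 4 and periodic = true the denominator stays at 4,
    // giving the window 0.0, 0.5, 1.0, 0.5 -- the same values as
    // torch.hann_window(4) with its default periodic=True, which is the
    // convention whisper's preprocessing expects.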
} global_cache;
}

// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
static void dft(const float* in, int N, float* out) {
    const int sin_cos_step = SIN_COS_N_COUNT / N;

    for (int k = 0; k < N; k++) {
        float re = 0;
        float im = 0;

        for (int n = 0; n < N; n++) {
            int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
            re += in[n]*global_cache.cos_vals[idx]; // cos(t)
            im -= in[n]*global_cache.sin_vals[idx]; // sin(t)
        }

        out[k*2 + 0] = re;
        out[k*2 + 1] = im;
    }
}

// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
static void fft(float* in, int N, float* out) {
    if (N == 1) {
        out[0] = in[0];
        out[1] = 0;
        return;
    }

    const int half_N = N / 2;
    if (N - half_N*2 == 1) {
        dft(in, N, out);
        return;
    }

    float* even = in + N;
    for (int i = 0; i < half_N; ++i) {
        even[i] = in[2*i];
    }
    float* even_fft = out + 2 * N;
    fft(even, half_N, even_fft);

    float* odd = even;
    for (int i = 0; i < half_N; ++i) {
        odd[i] = in[2*i + 1];
    }
    float* odd_fft = even_fft + N;
    fft(odd, half_N, odd_fft);

    const int sin_cos_step = SIN_COS_N_COUNT / N;
    for (int k = 0; k < half_N; k++) {
        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
        float re = global_cache.cos_vals[idx]; // cos(t)
        float im = -global_cache.sin_vals[idx]; // sin(t)

        float re_odd = odd_fft[2*k + 0];
        float im_odd = odd_fft[2*k + 1];

        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;

        out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
        out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
    }
}
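
// Illustrative helper (added in this edit as a sketch, not part of the original
// patch): fft() recurses into scratch space inside both buffers, so callers must
// over-allocate exactly like log_mel_spectrogram_worker_thread() below does:
// the input buffer needs 2*N floats (N samples followed by N floats of scratch
// for the even/odd split) and the output buffer needs 8*N floats (2*N for the
// interleaved re/im result plus storage for the recursive sub-FFTs).
// N must divide SIN_COS_N_COUNT, i.e. use N == WHISPER_N_FFT here.
static void fft_with_scratch_example(const float * samples, int N, std::vector<float> & spectrum) {
    std::vector<float> fft_in (N * 2, 0.0f); // N samples + N floats of recursion scratch
    std::vector<float> fft_out(N * 8);       // complex output + recursion scratch
    std::copy(samples, samples + N, fft_in.begin());
    fft(fft_in.data(), N, fft_out.data());
    spectrum.assign(fft_out.begin(), fft_out.begin() + 2 * N); // k-th bin at [2k], [2k+1]
}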

static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
                                              int n_samples, int frame_size, int frame_step, int n_threads,
                                              const whisper_filters & filters, whisper_mel & mel) {
    std::vector<float> fft_in(frame_size * 2, 0.0);
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);

    int n_fft = filters.n_fft;
    int i = ith;

    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
    WHISPER_ASSERT(n_fft == 1 + (frame_size / 2));

    // calculate the FFT only when fft_in is not all zero
    for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
        const int offset = i * frame_step;

        // apply Hann window (~10% faster)
        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
            fft_in[j] = hann[j] * samples[offset + j];
        }

        // fill the rest with zeros
        if (n_samples - offset < frame_size) {
            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
        }

        // FFT
        fft(fft_in.data(), frame_size, fft_out.data());

        // calculate modulus^2 of the complex numbers
        // using pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes an inference quality problem? Interesting.
        for (int j = 0; j < n_fft; j++) {
            fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
        }

        // mel spectrogram
        for (int j = 0; j < mel.n_mel; j++) {
            double sum = 0.0;
            // unroll loop (suggested by GH user @lunixbochs)
            int k = 0;
            for (k = 0; k < n_fft - 3; k += 4) {
                sum +=
                    fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
                    fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
                    fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
                    fft_out[k + 3] * filters.data[j * n_fft + k + 3];
            }
            // handle n_fft remainder
            for (; k < n_fft; k++) {
                sum += fft_out[k] * filters.data[j * n_fft + k];
            }
            sum = log10(std::max(sum, 1e-10));
            mel.data[j * mel.n_len + i] = sum;
        }
    }

    // otherwise fft_out is all zero
    double sum = log10(1e-10);
    for (; i < mel.n_len; i += n_threads) {
        for (int j = 0; j < mel.n_mel; j++) {
            mel.data[j * mel.n_len + i] = sum;
        }
    }
}

// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
static bool log_mel_spectrogram(
        const float * samples,
        const int     n_samples,
        const int     /*sample_rate*/,
        const int     frame_size,
        const int     frame_step,
        const int     n_mel,
        const int     n_threads,
        const whisper_filters & filters,
        const bool    debug,
        whisper_mel & mel) {
    //const int64_t t_start_us = ggml_time_us();

    // Hann window
    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
    const float * hann = global_cache.hann_window;

    // calculate the length of padding
    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
    int64_t stage_2_pad = frame_size / 2;

    // initialize a vector and copy data from the C array into it
    std::vector<float> samples_padded;
    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);

    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);

    // reflective pad 200 samples at the beginning of audio
    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());

    mel.n_mel     = n_mel;
    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
    // calculate number of frames + remove the last frame
    mel.n_len     = (samples_padded.size() - frame_size) / frame_step;
    // calculate semi-padded sample length to ensure compatibility
    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
    mel.data.resize(mel.n_mel * mel.n_len);

    {
        std::vector<std::thread> workers(n_threads - 1);
        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw] = std::thread(
                    log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
                    std::cref(filters), std::ref(mel));
        }

        // main thread
        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);

        for (int iw = 0; iw < n_threads - 1; ++iw) {
            workers[iw].join();
        }
    }

    // clamping and normalization
    double mmax = -1e20;
    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] > mmax) {
            mmax = mel.data[i];
        }
    }

    mmax -= 8.0;

    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
        if (mel.data[i] < mmax) {
            mel.data[i] = mmax;
        }

        mel.data[i] = (mel.data[i] + 4.0)/4.0;
    }

    // dump the log_mel_spectrogram
    if (debug) {
        std::ofstream outFile("log_mel_spectrogram.json");
        outFile << "[";
        for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
            outFile << mel.data[i] << ", ";
        }
        outFile << mel.data[mel.data.size() - 1] << "]";
        outFile.close();
    }

    return true;
}

bool preprocess_audio(
        const float * samples,
        size_t n_samples,
        const whisper_filters & filters,
        std::vector<whisper_mel> & output) {

    if (n_samples == 0) {
        // empty audio
        return false;
    }

    whisper_mel out_full;
    bool ok = log_mel_spectrogram(
                samples,
                n_samples,
                COMMON_SAMPLE_RATE,
                WHISPER_N_FFT,
                WHISPER_HOP_LENGTH,
                filters.n_mel,
                4, // n_threads
                filters,
                false, // debug
                out_full);
    if (!ok) {
        return false;
    }

    // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
    // we always expect the mel to have 3000 silent frames at the end
    // printf("n_len %d\n", out_full.n_len);
    const size_t frames_per_chunk = 3000;
    GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
    for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
        int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
        if ((size_t)n_len < frames_per_chunk) {
            break; // the last incomplete chunk is always a padded chunk, safe to ignore
        }

        whisper_mel out_chunk;
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
        out_chunk.n_len_org = out_full.n_mel; // unused
        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);

        for (int i = 0; i < out_full.n_mel; i++) {
            auto src = out_full.data.begin() + i*out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }

        output.push_back(std::move(out_chunk));
    }

    return true;
}

} // namespace whisper_preprocessor
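
// Usage sketch (added in this edit, not part of the original patch): feeding
// 16 kHz mono PCM through the preprocessor with the precalculated 128-bin
// filter bank below yields one whisper_mel per 3000-frame (30 s) chunk, which
// is the unit the clip.cpp cgraph consumes.
static bool mel_from_pcm_example(const std::vector<float> & pcmf32,
                                 std::vector<whisper_preprocessor::whisper_mel> & mels) {
    const whisper_preprocessor::whisper_filters filters = whisper_precalc_filters::get_128_bins();
    return whisper_preprocessor::preprocess_audio(pcmf32.data(), pcmf32.size(), filters, mels);
}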

// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 at the end of the function
//
// generated from this Python code:
//
// from numpy import load
// data = load('mel_filters.npz')
// lst = data.files
// for item in lst:
//     print(item)
//     print(data[item].shape)
//     n_mel = data[item].shape[0]
//     n_fft = data[item].shape[1]
//     for i, row in enumerate(data[item]):
//         for j, val in enumerate(row):
//             val = val * 1000.0
//             if val != 0:
//                 print(f"data[{i*n_fft + j}] = {val:.6f};")

namespace whisper_precalc_filters {

whisper_preprocessor::whisper_filters get_128_bins() {
    whisper_preprocessor::whisper_filters filters;
    filters.n_mel = 128;
    filters.n_fft = 201;
    std::vector<float> data(filters.n_mel * filters.n_fft, 0.0f);

    data[1] = 12.37398665;
    data[202] = 30.39256483;
    data[404] = 24.74797331;
    data[605] = 18.01857911;
    data[807] = 37.12195903;
    data[1008] = 5.64459199;
    data[1009] = 6.72939420;
    data[1210] = 36.03715822;
    data[1412] = 19.10337992;
    data[1613] = 23.66316877;
    data[1815] = 31.47736564;
    data[2016] = 11.28918398;
    data[2017] = 1.08480197;
    data[2218] = 41.68175161;
    data[2420] = 13.45878839;
    data[2621] = 29.30776216;
    data[2823] = 25.83277412;
    data[3024] = 16.93377644;
    data[3226] = 38.20675984;
    data[3427] = 4.55979025;
    data[3428] = 7.81419594;
    data[3629] = 34.95235741;
    data[3831] = 20.18818259;
    data[4032] = 22.57836796;
    data[4234] = 32.56217018;
    data[4435] = 10.20438317;
    data[4436] = 2.16960395;
    data[4637] = 40.59694707;
    data[4839] = 14.54358920;
    data[5040] = 28.22295949;
    data[5242] = 26.91757679;
    data[5443] = 15.84897563;
    data[5645] = 39.29156065;
    data[5846] = 3.47498828;
    data[5847] = 8.89899861;
    data[6048] = 33.86755288;
    data[6250] = 21.27298526;
    data[6451] = 21.49356715;
    data[6653] = 33.64697099;
    data[6854] = 9.11958050;
    data[6855] = 3.25440569;
    data[7056] = 39.51214626;
    data[7258] = 15.62839188;
    data[7459] = 27.13815868;
    data[7661] = 28.00237760;
    data[7862] = 14.76417296;
    data[8064] = 40.37636518;
    data[8265] = 2.38068704;
    data[8266] = 10.20263787;
    data[8467] = 31.61146119;
    data[8669] = 24.54700135;
    data[8870] = 15.32919332;
    data[8871] = 1.66583748;
    data[9072] = 36.72905266;
    data[9274] = 20.09709924;
    data[9475] = 16.93102531;
    data[9476] = 2.90265540;
    data[9677] = 32.84499049;
    data[9879] = 23.52004871;
    data[10080] = 11.03894413;
    data[10081] = 10.72582975;
    data[10282] = 22.71829173;
    data[10484] = 32.27872774;
    data[10685] = 0.11626833;
    data[10686] = 22.85348251;
    data[10887] = 8.56344029;
    data[10888] = 14.97978810;
    data[11089] = 15.51398356;
    data[11090] = 8.51490628;
    data[11291] = 21.10680379;
    data[11292] = 3.32652032;
    data[11493] = 25.47064796;
    data[11695] = 27.35907957;
    data[11896] = 0.65853616;
    data[11897] = 23.83812517;
    data[12098] = 3.44359246;
    data[12099] = 21.22455277;
    data[12300] = 5.35842171;
    data[12301] = 19.42555793;
    data[12502] = 6.49324711;
    data[12503] = 18.35542172;
    data[12704] = 6.93138083;
    data[12705] = 17.93504693;
    data[12906] = 6.74968259;
    data[12907] = 18.09151843;
    data[13108] = 6.01899112;
    data[13109] = 18.75767298;
    data[13310] = 4.80452832;
    data[13311] = 19.87172849;
    data[13512] = 3.16627859;
    data[13513] = 21.37690969;
    data[13514] = 1.25317345;
    data[13714] = 1.15934468;
    data[13715] = 20.80361731;
    data[13716] = 4.04486805;
    data[13917] = 17.55363122;
    data[13918] = 7.08320038;
    data[14119] = 14.07538634;
    data[14120] = 10.32655034;
    data[14321] = 10.40921453;
    data[14322] = 13.73696327;
    data[14523] = 6.59187697;
    data[14524] = 17.27988198;
    data[14525] = 1.46804214;
    data[14725] = 2.65681883;
    data[14726] = 18.09193194;
    data[14727] = 5.85655728;
    data[14928] = 13.34277913;
    data[14929] = 10.28267574;
    data[15130] = 8.56800377;
    data[15131] = 14.72230814;
    data[15132] = 1.04039861;
    data[15332] = 3.79085587;
    data[15333] = 17.14678481;
    data[15334] = 6.11609267;
    data[15535] = 11.75929047;
    data[15536] = 11.13393717;
    data[15737] = 6.43857848;
    data[15738] = 16.07806236;
    data[15739] = 4.23917221;
    data[15939] = 1.19989377;
    data[15940] = 12.75671553;
    data[15941] = 9.65298992;
    data[16142] = 7.06935255;
    data[16143] = 14.94054683;
    data[16144] = 4.19024844;
    data[16344] = 1.51483389;
    data[16345] = 12.00899947;
    data[16346] = 9.84823331;
    data[16547] = 6.10224018;
    data[16548] = 15.33857174;
    data[16549] = 5.57676842;
    data[16749] = 0.36827257;
    data[16750] = 9.89749376;
    data[16751] = 11.35340426;
    data[16752] = 2.05122307;
    data[16952] = 3.89297144;
    data[16953] = 12.97352277;
    data[16954] = 8.06631614;
    data[17155] = 6.74493238;
    data[17156] = 13.85874674;
    data[17157] = 5.41190524;
    data[17357] = 0.74220158;
    data[17358] = 8.98779090;
    data[17359] = 11.37871388;
    data[17360] = 3.32958088;
    data[17560] = 2.82313535;
    data[17561] = 10.68049297;
    data[17562] = 9.43340641;
    data[17563] = 1.76325557;
    data[17763] = 4.39018616;
    data[17764] = 11.87758986;
    data[17765] = 7.97005836;
    data[17766] = 0.66104700;
    data[17966] = 5.49466675;
    data[17967] = 12.62953598;
    data[17968] = 6.93987962;
    data[18169] = 6.18401915;
    data[18170] = 12.93473132;
    data[18171] = 6.29778765;
    data[18371] = 0.02325210;
    data[18372] = 6.50206627;
    data[18373] = 12.32661773;
    data[18374] = 6.00216538;
    data[18574] = 0.31548753;
    data[18575] = 6.48925547;
    data[18576] = 12.04130240;
    data[18577] = 6.01462880;
    data[18777] = 0.29979556;
    data[18778] = 6.18288014;
    data[18779] = 12.04272825;
    data[18780] = 6.29981188;
    data[18781] = 0.55689598;
    data[18980] = 0.01120471;
    data[18981] = 5.61729167;
    data[18982] = 11.22337859;
    data[18983] = 6.82516303;
    data[18984] = 1.35264499;
    data[19184] = 4.82410006;
    data[19185] = 10.16623247;
    data[19186] = 7.56075513;
    data[19187] = 2.34590308;
    data[19387] = 3.83235747;
    data[19388] = 8.92296247;
    data[19389] = 8.47910438;
    data[19390] = 3.50978645;
    data[19590] = 2.66873185;
    data[19591] = 7.51965167;
    data[19592] = 9.55500547;
    data[19593] = 4.81966138;
    data[19594] = 0.08431751;
    data[19793] = 1.35767367;
    data[19794] = 5.98019501;
    data[19795] = 10.60271543;
    data[19796] = 6.25298498;
    data[19797] = 1.74059917;
    data[19997] = 4.32644226;
    data[19998] = 8.73131864;
    data[19999] = 7.78916525;
    data[20000] = 3.48923868;
    data[20200] = 2.57835095;
    data[20201] = 6.77582854;
    data[20202] = 9.40941647;
    data[20203] = 5.31194592;
    data[20204] = 1.21447595;
    data[20403] = 0.75411191;
    data[20404] = 4.75395704;
    data[20405] = 8.75380263;
    data[20406] = 7.19209015;
    data[20407] = 3.28754401;
    data[20607] = 2.68179690;
    data[20608] = 6.49331464;
    data[20609] = 9.11457930;
    data[20610] = 5.39387390;
    data[20611] = 1.67316827;
    data[20810] = 0.57394296;
    data[20811] = 4.20600036;
    data[20812] = 7.83805829;
    data[20813] = 7.52023002;
    data[20814] = 3.97470826;
    data[20815] = 0.42918732;
    data[21014] = 1.90464477;
    data[21015] = 5.36569161;
    data[21016] = 8.82673822;
    data[21017] = 6.27609482;
    data[21018] = 2.89750961;
    data[21218] = 2.89885257;
    data[21219] = 6.19694078;
    data[21220] = 8.56699049;
    data[21221] = 5.34748193;
    data[21222] = 2.12797290;
    data[21421] = 0.44750227;
    data[21422] = 3.59030394;
    data[21423] = 6.73310598;
    data[21424] = 7.77023612;
    data[21425] = 4.70231380;
    data[21426] = 1.63439126;
    data[21625] = 1.01536023;
    data[21626] = 4.01018746;
    data[21627] = 7.00501446;
    data[21628] = 7.23442994;
    data[21629] = 4.31095669;
    data[21630] = 1.38748321;
    data[21829] = 1.33348850;
    data[21830] = 4.18730825;
    data[21831] = 7.04112789;
    data[21832] = 6.93188375;
    data[21833] = 4.14605811;
    data[21834] = 1.36023236;
    data[22033] = 1.42879714;
    data[22034] = 4.14824858;
    data[22035] = 6.86769979;
    data[22036] = 6.83705276;
    data[22037] = 4.18239459;
    data[22038] = 1.52773573;
    data[22237] = 1.32610439;
    data[22238] = 3.91751388;
    data[22239] = 6.50892360;
    data[22240] = 6.92639686;
    data[22241] = 4.39672917;
    data[22242] = 1.86706171;
    data[22441] = 1.04827771;
    data[22442] = 3.51767405;
    data[22443] = 5.98707050;
    data[22444] = 7.17824046;
    data[22445] = 4.76767914;
    data[22446] = 2.35711760;
    data[22645] = 0.61636406;
    data[22646] = 2.96949223;
    data[22647] = 5.32262027;
    data[22648] = 7.57265091;
    data[22649] = 5.27558755;
    data[22650] = 2.97852419;
    data[22651] = 0.68146095;
    data[22849] = 0.04971400;
    data[22850] = 2.29204819;
    data[22851] = 4.53438237;
    data[22852] = 6.77671656;
    data[22853] = 5.90240723;
    data[22854] = 3.71349836;
    data[22855] = 1.52458926;
    data[23054] = 1.50285335;
    data[23055] = 3.63961048;
    data[23056] = 5.77636715;
    data[23057] = 6.63159089;
    data[23058] = 4.54574358;
    data[23059] = 2.45989650;
    data[23060] = 0.37404924;
    data[23258] = 0.61795861;
    data[23259] = 2.65410915;
    data[23260] = 4.69025923;
    data[23261] = 6.72641024;
    data[23262] = 5.46034705;
    data[23263] = 3.47270933;
    data[23264] = 1.48507138;
    data[23463] = 1.59233576;
    data[23464] = 3.53261665;
    data[23465] = 5.47289755;
    data[23466] = 6.44368259;
    data[23467] = 4.54962999;
    data[23468] = 2.65557761;
    data[23469] = 0.76152512;
    data[23667] = 0.46749352;
    data[23668] = 2.31641904;
    data[23669] = 4.16534441;
    data[23670] = 6.01426978;
    data[23671] = 5.67844696;
    data[23672] = 3.87357362;
    data[23673] = 2.06870004;
    data[23674] = 0.26382666;
    data[23872] = 1.05349103;
    data[23873] = 2.81536230;
    data[23874] = 4.57723346;
    data[23875] = 6.33910485;
    data[23876] = 5.12815686;
    data[23877] = 3.40826320;
    data[23878] = 1.68837002;
    data[24077] = 1.43350090;
    data[24078] = 3.11241671;
    data[24079] = 4.79133241;
    data[24080] = 6.40943693;
    data[24081] = 4.77052201;
    data[24082] = 3.13160778;
    data[24083] = 1.49269309;
    data[24281] = 0.02932359;
    data[24282] = 1.62918994;
    data[24283] = 3.22905602;
    data[24284] = 4.82892245;
    data[24285] = 6.14671456;
    data[24286] = 4.58496623;
    data[24287] = 3.02321767;
    data[24288] = 1.46146910;
    data[24486] = 0.13601698;
    data[24487] = 1.66055572;
    data[24488] = 3.18509457;
    data[24489] = 4.70963307;
    data[24490] = 6.04072399;
    data[24491] = 4.55250870;
    data[24492] = 3.06429295;
    data[24493] = 1.57607743;
    data[24494] = 0.08786193;
    data[24691] = 0.09328097;
    data[24692] = 1.54603878;
    data[24693] = 2.99879676;
    data[24694] = 4.45155473;
    data[24695] = 5.90431225;
    data[24696] = 4.65566106;
    data[24697] = 3.23751615;
    data[24698] = 1.81937125;
    data[24699] = 0.40122634;
    data[24897] = 1.30262633;
    data[24898] = 2.68698297;
    data[24899] = 4.07133950;
    data[24900] = 5.45569602;
    data[24901] = 4.87832492;
    data[24902] = 3.52695142;
    data[24903] = 2.17557792;
    data[24904] = 0.82420459;
    data[25102] = 0.94595028;
    data[25103] = 2.26512621;
    data[25104] = 3.58430226;
    data[25105] = 4.90347855;
    data[25106] = 5.20569785;
    data[25107] = 3.91795207;
    data[25108] = 2.63020652;
    data[25109] = 1.34246063;
    data[25110] = 0.05471494;
    data[25307] = 0.49037894;
    data[25308] = 1.74744334;
    data[25309] = 3.00450763;
    data[25310] = 4.26157191;
    data[25311] = 5.51863620;
    data[25312] = 4.39707236;
    data[25313] = 3.16995848;
    data[25314] = 1.94284460;
    data[25315] = 0.71573065;
    data[25513] = 1.14698056;
    data[25514] = 2.34485767;
    data[25515] = 3.54273478;
    data[25516] = 4.74061165;
    data[25517] = 4.95198462;
    data[25518] = 3.78264743;
    data[25519] = 2.61331047;
    data[25520] = 1.44397374;
    data[25521] = 0.27463681;
    data[25718] = 0.47569509;
    data[25719] = 1.61717169;
    data[25720] = 2.75864848;
    data[25721] = 3.90012516;
    data[25722] = 5.04160160;
    data[25723] = 4.45712078;
    data[25724] = 3.34284059;
    data[25725] = 2.22856039;
    data[25726] = 1.11428020;

    for (auto & val : data) {
        val /= 1000.0f;
    }

    filters.data = std::move(data);
    return filters;
}

} // namespace whisper_precalc_filters
47
examples/mtmd/mtmd-audio.h
Normal file
@@ -0,0 +1,47 @@
#pragma once

#include "ggml.h"

#include <cstdint>
#include <vector>
#include <string>

#define WHISPER_ASSERT GGML_ASSERT

#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT       400
#define WHISPER_HOP_LENGTH  160
#define WHISPER_CHUNK_SIZE  30

#define COMMON_SAMPLE_RATE 16000

namespace whisper_preprocessor {

struct whisper_mel {
    int n_len;
    int n_len_org;
    int n_mel;

    std::vector<float> data;
};

struct whisper_filters {
    int32_t n_mel;
    int32_t n_fft;

    std::vector<float> data;
};

bool preprocess_audio(
        const float * samples,
        size_t n_samples,
        const whisper_filters & filters,
        std::vector<whisper_mel> & output);

} // namespace whisper_preprocessor

namespace whisper_precalc_filters {

whisper_preprocessor::whisper_filters get_128_bins();

} // namespace whisper_precalc_filters
445
examples/mtmd/mtmd-cli.cpp
Normal file
@@ -0,0 +1,445 @@
//#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include "ggml.h"
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include "mtmd-helper.h"

#include <vector>
#include <limits.h>
#include <cinttypes>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

// volatile, because of signal being an interrupt
static volatile bool g_is_generating = false;
static volatile bool g_is_interrupted = false;

/**
 * Please note that this is NOT production-ready code.
 * It is a playground for trying out multimodal support in llama.cpp.
 * For contributors: please keep this code simple and easy to understand.
 */

static void show_additional_info(int /*argc*/, char ** argv) {
    LOG_TEE(
        "Experimental CLI for multimodal\n\n"
        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
        "  -m and --mmproj are required\n"
        "  -hf user/repo can replace both -m and --mmproj in most cases\n"
        "  --image, --audio and -p are optional; if NOT provided, the CLI will run in chat mode\n"
        "  to disable using GPU for the mmproj model, add --no-mmproj-offload\n",
        argv[0]
    );
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (g_is_generating) {
            g_is_generating = false;
        } else {
            console::cleanup();
            if (g_is_interrupted) {
                _exit(1);
            }
            g_is_interrupted = true;
        }
    }
}
#endif

// ======================= compat ================================
using common_init_result = llama_init_result;
using common_sampler     = llama_sampling_context;
using llama_tokens       = std::vector<llama_token>;
using common_params      = gpt_params;

inline common_init_result common_init_from_params(gpt_params & params) {
    return llama_init_from_gpt_params(params);
}
inline llama_sampling_context * common_sampler_init(const llama_model * model, const llama_sampling_params & sparams) {
    return llama_sampling_init(llama_get_model_vocab(model), sparams);
}
inline std::vector<llama_token> common_tokenize(const llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false) {
    return llama_tokenize(ctx, text, add_special, parse_special);
}
inline void common_sampler_free(common_sampler * smpl) {
    llama_sampling_free(smpl);
}
inline llama_token common_sampler_sample(common_sampler * gsmpl, llama_context * ctx, int idx, [[maybe_unused]] bool grammar_first = false) {
    return llama_sampling_sample(gsmpl, ctx, nullptr, idx);
}
inline void common_sampler_accept(common_sampler * gsmpl, llama_context * ctx, llama_token token, bool accept_grammar) {
    llama_sampling_accept(gsmpl, ctx, token, accept_grammar);
}
inline std::string common_token_to_piece(const llama_context * ctx, llama_token token, bool special = true) {
    return llama_token_to_piece(ctx, token, special);
}
inline void common_batch_clear(llama_batch & batch) {
    llama_batch_clear(batch);
}
inline void common_batch_add(llama_batch & batch, llama_token id, llama_pos pos, const std::vector<llama_seq_id> & seq_ids, bool logits) {
    llama_batch_add(batch, id, pos, seq_ids, logits);
}
void common_init() {
#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif
    LOG_TEE("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}

#ifndef LOG_ERR
#define LOG_ERR LOG_TEE
#endif
#ifndef LOG_INF
#define LOG_INF LOG_TEE
#endif
#ifndef LOG_DBG
#define LOG_DBG LOG
#endif
// ======================= end compat ================================

struct mtmd_cli_context {
    mtmd::context_ptr ctx_vision;
    common_init_result llama_init;

    llama_model       * model;
    llama_context     * lctx;
    const llama_vocab * vocab;
    common_sampler    * smpl;
    llama_batch         batch;
    int                 n_batch;

    mtmd::bitmaps bitmaps;

    // note: we know that the gemma3 template is "linear", meaning each turn is completely separate from the others,
    // so here we don't need to keep track of chat history
    common_chat_templates_ptr tmpls;

    // support for legacy templates (models not having an EOT token)
    llama_tokens antiprompt_tokens;

    int n_threads    = 1;
    llama_pos n_past = 0;

    mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
        model = llama_init.model; //.get();
        lctx = llama_init.context; //.get();
        vocab = llama_model_get_vocab(model);
        smpl = common_sampler_init(model, params.sparams); //sampling);
        n_threads = params.n_threads;
        batch = llama_batch_init(1, 0, 1); // batch for next token generation
        n_batch = params.n_batch;

        if (!model || !lctx) {
            exit(1);
        }

        if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
            LOG_ERR("Model does not have a chat template.\n");
            LOG_ERR("  For old llava models, you may need to use '--chat-template vicuna'\n");
            LOG_ERR("  For MobileVLM models, use '--chat-template deepseek'\n");
            LOG_ERR("  For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
            exit(1);
        }

        tmpls = common_chat_templates_init(model, params.chat_template);
        LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());

        init_vision_context(params);

        // load antiprompt tokens for legacy templates
        if (params.chat_template == "vicuna") {
            antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
        } else if (params.chat_template == "deepseek") {
            antiprompt_tokens = common_tokenize(lctx, "###", false, true);
        }
    }

    ~mtmd_cli_context() {
        llama_batch_free(batch);
        common_sampler_free(smpl);
    }

    void init_vision_context(common_params & params) {
        const char * clip_path = params.mmproj.path.c_str();
        mtmd_context_params mparams = mtmd_context_params_default();
        mparams.use_gpu = params.mmproj_use_gpu;
        mparams.print_timings = true;
        mparams.n_threads = params.n_threads;
        mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
            exit(1);
        }
    }

    bool check_antiprompt(const llama_tokens & generated_tokens) {
        if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
            return false;
        }
        return std::equal(
            generated_tokens.end() - antiprompt_tokens.size(),
            generated_tokens.end(),
            antiprompt_tokens.begin()
        );
    }
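
    // Example (annotation added in this edit): with '--chat-template deepseek',
    // antiprompt_tokens holds the token ids of "###". std::equal() compares just
    // the last antiprompt_tokens.size() entries of generated_tokens, i.e. the
    // check fires exactly when the newest generated tokens spell out the antiprompt.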

    bool load_media(const std::string & fname) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
        if (!bmp.ptr) {
            return false;
        }
        bitmaps.entries.push_back(std::move(bmp));
        return true;
    }
};

static int generate_response(mtmd_cli_context & ctx, int n_predict) {
    llama_tokens generated_tokens;
    for (int i = 0; i < n_predict; i++) {
        if (i > n_predict || !g_is_generating || g_is_interrupted) {
            LOG_TEE("\n");
            break;
        }

        llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
        generated_tokens.push_back(token_id);
        common_sampler_accept(ctx.smpl, ctx.lctx, token_id, true);

        if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
            LOG_TEE("\n");
            break; // end of generation
        }

        LOG_TEE("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
        fflush(stdout);

        if (g_is_interrupted) {
            LOG_TEE("\n");
            break;
        }

        // eval the token
        common_batch_clear(ctx.batch);
        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
        if (llama_decode(ctx.lctx, ctx.batch)) {
            LOG_ERR("failed to decode token\n");
            return 1;
        }
    }
    return 0;
}

static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
    common_chat_templates_inputs tmpl_inputs;
    tmpl_inputs.messages = {msg};
    tmpl_inputs.add_generation_prompt = true;
    tmpl_inputs.use_jinja = false; // jinja is buggy here
    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());

    mtmd_input_text text;
    text.text          = formatted_chat.prompt.c_str();
    text.add_special   = add_bos;
    text.parse_special = true;

    if (g_is_interrupted) return 0;

    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
                        chunks.ptr.get(), // output
                        &text, // text
                        bitmaps_c_ptr.data(),
                        bitmaps_c_ptr.size());
    if (res != 0) {
        LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
        return 1;
    }

    ctx.bitmaps.entries.clear();

    llama_pos new_n_past;
    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
                ctx.lctx, // lctx
                chunks.ptr.get(), // chunks
                ctx.n_past, // n_past
                0, // seq_id
                ctx.n_batch, // n_batch
                true, // logits_last
                &new_n_past)) {
        LOG_ERR("Unable to eval prompt\n");
        return 1;
    }

    ctx.n_past = new_n_past;

    LOG("\n");

    return 0;
}

int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;
    params.sparams.temp = 0.2; // lower temp by default for better quality

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    //if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
    //    return 1;
    //}

    common_init();

    if (params.mmproj.path.empty()) {
        show_additional_info(argc, argv);
        LOG_ERR("ERR: Missing --mmproj argument\n");
        return 1;
    }

    mtmd_cli_context ctx(params);
    LOG("%s: loading model: %s\n", __func__, params.model.c_str());

    bool is_single_turn = !params.prompt.empty() && !params.image.empty();

    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

    // Ctrl+C handling
    {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
    }

    if (g_is_interrupted) return 130;

    if (is_single_turn) {
        g_is_generating = true;
        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
            for (size_t i = 0; i < params.image.size(); i++) {
                params.prompt += mtmd_default_marker();
            }
        }
        common_chat_msg msg;
        msg.role = "user";
        msg.content = params.prompt;
        for (const auto & image : params.image) {
            if (!ctx.load_media(image)) {
                return 1; // error is already printed by libmtmd
            }
        }
        if (eval_message(ctx, msg, true)) {
            return 1;
        }
        if (!g_is_interrupted && generate_response(ctx, n_predict)) {
            return 1;
        }

    } else {
        LOG_TEE("\n Running in chat mode, available commands:");
        if (mtmd_support_vision(ctx.ctx_vision.get())) {
            LOG_TEE("\n   /image <path>    load an image");
        }
        if (mtmd_support_audio(ctx.ctx_vision.get())) {
            LOG_TEE("\n   /audio <path>    load an audio file");
        }
        LOG_TEE("\n   /clear           clear the chat history");
        LOG_TEE("\n   /quit or /exit   exit the program");
        LOG_TEE("\n");

        bool is_first_msg = true;
        std::string content;

        while (!g_is_interrupted) {
            g_is_generating = false;
            LOG_TEE("\n> ");
            console::set_display(console::user_input);
            std::string line;
            console::readline(line, false);
            if (g_is_interrupted) break;
            console::set_display(console::reset);
            line = string_strip(line);
            if (line.empty()) {
                continue;
            }
            if (line == "/quit" || line == "/exit") {
                break;
            }
            if (line == "/clear") {
                ctx.n_past = 0;
                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1);
                //llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
                LOG_TEE("Chat history cleared\n\n");
                continue;
            }
            g_is_generating = true;
            bool is_image = line == "/image" || line.find("/image ") == 0;
            bool is_audio = line == "/audio" || line.find("/audio ") == 0;
            if (is_image || is_audio) {
                if (line.size() < 8) {
                    LOG_ERR("ERR: Missing media filename\n");
                    continue;
                }
                std::string media_path = line.substr(7);
                if (ctx.load_media(media_path)) {
                    LOG_TEE("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
                    content += mtmd_default_marker();
                }
                // else, error is already printed by libmtmd
                continue;
            } else {
                content += line;
            }
            common_chat_msg msg;
            msg.role = "user";
            msg.content = content;
            int ret = eval_message(ctx, msg, is_first_msg);
            if (ret) {
                return 1;
            }
            if (g_is_interrupted) break;
            if (generate_response(ctx, n_predict)) {
                return 1;
            }
            content.clear();
            is_first_msg = false;
        }
    }
    if (g_is_interrupted) LOG_TEE("\nInterrupted by user\n");
    LOG_TEE("\n\n");
    llama_print_timings(ctx.lctx);
    //llama_perf_context_print(ctx.lctx);
    return g_is_interrupted ? 130 : 0;
}
460
examples/mtmd/mtmd-helper.cpp
Normal file
@@ -0,0 +1,460 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#   define NOMINMAX
#endif
#include <windows.h>
#endif

#include "mtmd.h"
#include "mtmd-helper.h"
#include "llama.h"

#include <algorithm>
#include <cinttypes>
#include <vector>

//#define MTMD_AUDIO_DEBUG

#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
#   define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio/miniaudio.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"

#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)

size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
    size_t n_tokens = 0;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        auto chunk = mtmd_input_chunks_get(chunks, i);
        n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
    }
    return n_tokens;
}

llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
    llama_pos n_pos = 0;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        auto chunk = mtmd_input_chunks_get(chunks, i);
        n_pos += mtmd_input_chunk_get_n_pos(chunk);
    }
    return n_pos;
}

// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
    int n_pos_per_embd;
    int n_mmproj_embd;
    std::vector<llama_pos>      pos;
    std::vector<llama_pos>      pos_view; // used by mrope
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;
    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
        pos     .resize(n_tokens * n_pos_per_embd);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
        logits  .resize(n_tokens);
        seq_id_0.resize(1);
        seq_ids [n_tokens] = nullptr;
        batch = {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ embd,
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id   =*/ seq_ids.data(),
            /*logits   =*/ logits.data(),
        };
    }

    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
        seq_id_0[0] = seq_id;
        for (int i = 0; i < batch.n_tokens; i++) {
            batch.pos     [i] = pos_0 + i;
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = false;
        }
    }

    // M-RoPE for image
    void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
        GGML_ASSERT(n_pos_per_embd == 4);
        seq_id_0[0] = seq_id;
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                int i = y * nx + x;
                pos[i                     ] = pos_0;
                pos[i + batch.n_tokens    ] = pos_0 + y;
                pos[i + batch.n_tokens * 2] = pos_0 + x;
                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
            }
        }
        for (int i = 0; i < batch.n_tokens; i++) {
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = false;
        }
    }
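
    // Worked example (annotation added in this edit, not in the original patch):
    // for a 2x2 image chunk at pos_0 = 10 the four position rows laid out above are
    //   time: 10 10 10 10
    //   y:    10 10 11 11
    //   x:    10 11 10 11
    //   ext:   0  0  0  0
    // i.e. every token shares the chunk's start position on the time axis while
    // the y/x rows advance with the token's 2D location in the image grid.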

    // M-RoPE for audio
    void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
        GGML_ASSERT(n_pos_per_embd == 4);
        seq_id_0[0] = seq_id;
        for (int i = 0; i < batch.n_tokens; i++) {
            pos[i                     ] = pos_0 + i;
            pos[i + batch.n_tokens    ] = pos_0 + i;
            pos[i + batch.n_tokens * 2] = pos_0 + i;
            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
        }
        for (int i = 0; i < batch.n_tokens; i++) {
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data();
            batch.logits  [i] = false;
        }
    }

    llama_batch get_view(int offset, int n_tokens) {
        llama_pos * pos_ptr;
        pos_view.clear();
        pos_view.reserve(n_tokens * n_pos_per_embd);
        if (n_pos_per_embd > 1) {
            // mrope
            // for example, with a src layout of: 1234...1234...1234...1234...
            // offset 2 will give us dst: 34...34...34...34...
            for (int i = 0; i < n_pos_per_embd; i++) {
                // assume n_tokens is less than or equal to batch.n_tokens
                // batch.n_tokens is the **total** number of tokens
                // n_tokens is the number of viewed tokens
                size_t src_idx = i * batch.n_tokens + offset;
                pos_view.insert(pos_view.end(),
                    pos.data() + src_idx,
                    pos.data() + src_idx + n_tokens);
            }
            pos_ptr = pos_view.data();
        } else {
            // normal
            pos_ptr = pos.data() + offset;
        }
        return {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ batch.embd     + offset * n_mmproj_embd,
            /*pos      =*/ pos_ptr,
            /*n_seq_id =*/ batch.n_seq_id + offset,
            /*seq_id   =*/ batch.seq_id   + offset,
            /*logits   =*/ batch.logits   + offset,
        };
    }
};
|
||||
|
||||
// Helper function for decoding an image whose embeddings have already been calculated
|
||||
int32_t mtmd_helper_decode_image_chunk(
|
||||
mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(lctx);
|
||||
int n_mmproj_embd = llama_model_n_embd(model);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
GGML_ABORT("invalid chunk type for M-RoPE");
|
||||
}
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int pos_offset = i_batch*n_batch;
|
||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
||||
|
||||
LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
|
||||
i_batch++;
|
||||
}
|
||||
|
||||
n_past += mtmd_input_chunk_get_n_pos(chunk);
|
||||
*new_n_past = n_past;
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
// LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
|
||||
size_t i = 0;
|
||||
while (i < n_tokens) { // split into batches
|
||||
text_batch.n_tokens = 0; // clear the batch
|
||||
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
|
||||
int32_t j = text_batch.n_tokens;
|
||||
text_batch.token [j] = tokens[i];
|
||||
text_batch.pos [j] = n_past++;
|
||||
text_batch.n_seq_id[j] = 1;
|
||||
text_batch.seq_id [j][0] = seq_id;
|
||||
text_batch.logits [j] = false;
|
||||
|
||||
text_batch.n_tokens++;
|
||||
}
|
||||
bool is_last_token = (i == n_tokens);
|
||||
if (logits_last && is_last_token) {
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
*new_n_past += text_batch.n_tokens;
|
||||
}
|
||||
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
LOG_INF("encoding %s slice...\n", name);
|
||||
|
||||
ret = mtmd_encode_chunk(ctx, chunk);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode %s slice\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("chunk type not supported");
|
||||
}
|
||||
|
||||
llama_batch_free(text_batch);
|
||||
return 0;
|
||||
}
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
        struct llama_context * lctx,
        const mtmd_input_chunks * chunks,
        llama_pos n_past,
        llama_seq_id seq_id,
        int32_t n_batch,
        bool logits_last,
        llama_pos * new_n_past) {
    size_t n_chunks = mtmd_input_chunks_size(chunks);
    if (n_chunks == 0) {
        LOG_ERR("no chunks to eval\n");
        return 0;
    }

    for (size_t i = 0; i < n_chunks; i++) {
        bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
        auto chunk = mtmd_input_chunks_get(chunks, i);

        int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
        if (res != 0) {
            LOG_ERR("failed to eval chunk %zu\n", i);
            return res;
        }
        *new_n_past = n_past;
    }

    return 0;
}
namespace audio_helpers {

static bool is_audio_file(const char * buf, size_t len) {
    if (len < 12) {
        return false;
    }

    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
    bool is_mp3 = len >= 3 && (
        memcmp(buf, "ID3", 3) == 0 ||
        // Check for MPEG sync word (simplified check)
        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
    );
    bool is_flac = memcmp(buf, "fLaC", 4) == 0;

    return is_wav || is_mp3 || is_flac;
}

// decodes audio from the buffer into mono PCM f32; returns true on success
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
    ma_result result;
    const int channels = 1;
    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
    ma_decoder decoder;

    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
    if (result != MA_SUCCESS) {
        return false;
    }

    ma_uint64 frame_count;
    ma_uint64 frames_read;
    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
    if (result != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        return false;
    }

    pcmf32_mono.resize(frame_count);
    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
    if (result != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        return false;
    }

#ifdef MTMD_AUDIO_DEBUG
    // save audio to wav file
    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
    ma_encoder encoder;
    ma_encoder_init_file("output.wav", &config, &encoder);
    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
    ma_encoder_uninit(&encoder);
#endif

    ma_decoder_uninit(&decoder);
    return true;
}

} // namespace audio_helpers
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        int bitrate = mtmd_get_audio_bitrate(ctx);
        if (bitrate < 0) {
            LOG_ERR("This model does not support audio input\n");
            return nullptr;
        }
        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
            LOG_ERR("Unable to decode audio file from buffer\n");
            return nullptr;
        }
        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
    }

    // otherwise, we assume it's an image
    mtmd_bitmap * result = nullptr;
    {
        int nx, ny, nc;
        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
        if (!data) {
            LOG_ERR("%s: failed to decode image bytes\n", __func__);
            return nullptr;
        }
        result = mtmd_bitmap_init(nx, ny, data);
        stbi_image_free(data);
    }
    return result;
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
    std::vector<unsigned char> buf;
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return nullptr;
    }

    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
    buf.resize(file_size);

    size_t n_read = fread(buf.data(), 1, file_size, f);
    fclose(f);
    if (n_read != (size_t)file_size) {
        LOG_ERR("Failed to read entire file %s\n", fname);
        return nullptr;
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
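As a quick illustration of the loaders above, a hedged sketch that loads a media file and tags it for KV-cache tracking; ctx_mtmd and the file path are assumptions, not part of this patch:

// Sketch: load an image or audio file into an mtmd_bitmap.
// Audio is auto-detected by magic bytes; everything else goes through stb_image.
mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_file(ctx_mtmd, "test-1.jpeg");
if (bmp) {
    mtmd_bitmap_set_id(bmp, "img-0"); // optional ID, e.g. a hash of the pixel data
    // ... hand it to mtmd_tokenize() as part of a bitmap list ...
    mtmd_bitmap_free(bmp);
}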
examples/mtmd/mtmd-helper.h (new file, 91 lines)
@@ -0,0 +1,91 @@
#ifndef MTMD_HELPER_H
#define MTMD_HELPER_H

#include "ggml.h"
#include "llama.h"
#include "mtmd.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

//
// libmtmd helper functions
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//

// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);

// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
//     audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);

// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);

// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);

// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
                                         struct llama_context * lctx,
                                         const mtmd_input_chunks * chunks,
                                         llama_pos n_past,
                                         llama_seq_id seq_id,
                                         int32_t n_batch,
                                         bool logits_last,
                                         llama_pos * new_n_past);

// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                               struct llama_context * lctx,
                                               const mtmd_input_chunk * chunk,
                                               llama_pos n_past,
                                               llama_seq_id seq_id,
                                               int32_t n_batch,
                                               bool logits_last,
                                               llama_pos * new_n_past);

// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                struct llama_context * lctx,
                                                const mtmd_input_chunk * chunk,
                                                float * encoded_embd,
                                                llama_pos n_past,
                                                llama_seq_id seq_id,
                                                int32_t n_batch,
                                                llama_pos * new_n_past);

#ifdef __cplusplus
} // extern "C"
#endif

//
// C++ wrappers
//

#endif
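To show how these declarations compose, a minimal end-to-end sketch; ctx_mtmd, lctx, and bmp are illustrative assumptions (see the loader sketch above), not part of this patch:

// Sketch: tokenize a prompt containing one media marker plus one bitmap,
// then evaluate every resulting chunk.
mtmd_input_text text;
text.text          = "describe this: <__media__>";
text.add_special   = true;
text.parse_special = true;

mtmd_input_chunks * chunks = mtmd_input_chunks_init();
const mtmd_bitmap * bitmaps[] = { bmp };

if (mtmd_tokenize(ctx_mtmd, chunks, &text, bitmaps, 1) == 0) {
    llama_pos new_n_past = 0;
    mtmd_helper_eval_chunks(ctx_mtmd, lctx, chunks,
                            /*n_past      =*/ 0,
                            /*seq_id      =*/ 0,
                            /*n_batch     =*/ 512,
                            /*logits_last =*/ true,
                            &new_n_past);
}
mtmd_input_chunks_free(chunks);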
examples/mtmd/mtmd.cpp (new file, 1066 lines)
File diff suppressed because it is too large

examples/mtmd/mtmd.h (new file, 298 lines)
@@ -0,0 +1,298 @@
#ifndef MTMD_H
#define MTMD_H

#include "ggml.h"
#include "llama.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
#include <string>
#include <vector>
#include <cinttypes>
#include <memory>
#endif

/**
 * libmtmd: A library for multimodal support in llama.cpp.
 *
 * WARNING: This API is experimental and subject to many BREAKING CHANGES.
 * Issues related to API usage may receive lower priority support.
 *
 * For usage, see the example in mtmd-cli.cpp
 */

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define MTMD_API __declspec(dllexport)
#        else
#            define MTMD_API __declspec(dllimport)
#        endif
#    else
#        define MTMD_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define MTMD_API
#endif

// deprecated marker, use mtmd_default_marker() instead
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"

#ifdef __cplusplus
extern "C" {
#endif

enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT,
    MTMD_INPUT_CHUNK_TYPE_IMAGE,
    MTMD_INPUT_CHUNK_TYPE_AUDIO,
};

// opaque types
struct mtmd_context;
struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;

struct mtmd_input_text {
    const char * text;
    bool add_special;
    bool parse_special;
};

//
// C API
//

typedef struct mtmd_context      mtmd_context;
typedef struct mtmd_bitmap       mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk  mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text   mtmd_input_text;

struct mtmd_context_params {
    bool use_gpu;
    bool print_timings;
    int n_threads;
    enum ggml_log_level verbosity;
    const char * image_marker; // deprecated, use media_marker instead
    const char * media_marker;
};

MTMD_API const char * mtmd_default_marker(void);

MTMD_API struct mtmd_context_params mtmd_context_params_default(void);

// initialize the mtmd context
// return nullptr on failure
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
                                            const struct llama_model * text_model,
                                            const struct mtmd_context_params ctx_params);

MTMD_API void mtmd_free(mtmd_context * ctx);

// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);

// whether the current model uses M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);

// whether the current model supports vision input
MTMD_API bool mtmd_support_vision(mtmd_context * ctx);

// whether the current model supports audio input
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);

// get audio bitrate in Hz, for example 16000 for Whisper
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);

// mtmd_bitmap
//
// if bitmap is image:
//     length of data must be nx * ny * 3
//     the data is in RGBRGBRGB... format
// if bitmap is audio:
//     length of data must be n_samples * sizeof(float)
//     the data is in float format (PCM F32)
MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);


// mtmd_input_chunks
//
// this is simply a list of mtmd_input_chunk
// the elements can only be populated via mtmd_tokenize()
MTMD_API mtmd_input_chunks *      mtmd_input_chunks_init(void);
MTMD_API size_t                   mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
MTMD_API void                     mtmd_input_chunks_free(mtmd_input_chunks * chunks);

// mtmd_input_chunk
//
// the instance will be constructed via mtmd_tokenize()
// it will be freed along with mtmd_input_chunks
MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
// returns nullptr for ID on text chunk
MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);

// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
// you can move the chunk ownership to your own code by copying it
// remember to free the chunk when you are done with it
MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);


// mtmd_image_tokens
//
// the instance will be constructed via mtmd_tokenize()
// it will be freed along with mtmd_input_chunk
MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate

// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it
// the default marker is defined by mtmd_default_marker()
// the marker will be replaced with the image/audio chunk
// for example:
//   "here is an image: <__media__>\ndescribe it in detail."
// this gives 3 chunks:
//   1. "here is an image: <start_of_image>"
//   2. (image/audio tokens)
//   3. "<end_of_image>\ndescribe it in detail."
// number of bitmaps must be equal to the number of markers in the prompt
// this function is thread-safe (shared ctx)
// return values:
//   0 on success
//   1 on number of bitmaps not matching the number of markers
//   2 on image preprocessing error
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
                               mtmd_input_chunks * output,
                               const mtmd_input_text * text,
                               const mtmd_bitmap ** bitmaps,
                               size_t n_bitmaps);

// returns 0 on success
// TODO: deprecate
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
                             const mtmd_image_tokens * image_tokens);

// returns 0 on success
MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
                                   const mtmd_input_chunk * chunk);

// get output embeddings from the last encode pass
// the reading size (in bytes) is equal to:
//   llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);

/////////////////////////////////////////

// test function, to be used in test-mtmd-c-api.c
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);

#ifdef __cplusplus
} // extern "C"
#endif

//
// C++ wrappers
//

#ifdef __cplusplus

namespace mtmd {

struct mtmd_context_deleter {
    void operator()(mtmd_context * val) { mtmd_free(val); }
};
using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;

struct mtmd_bitmap_deleter {
    void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
};
using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;

struct mtmd_input_chunks_deleter {
    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
};
using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;

struct mtmd_input_chunk_deleter {
    void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
};
using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;

struct bitmap {
    bitmap_ptr ptr;
    bitmap() : ptr(nullptr) {}
    bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
    bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
    bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
        ptr.reset(mtmd_bitmap_init(nx, ny, data));
    }
    ~bitmap() = default;
    uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
    uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
    const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
    size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
    std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
    void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
};

struct bitmaps {
    std::vector<bitmap> entries;
    ~bitmaps() = default;
    // return list of pointers to mtmd_bitmap
    // example:
    //   auto bitmaps_c_ptr = bitmaps.c_ptr();
    //   int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
    std::vector<const mtmd_bitmap *> c_ptr() {
        std::vector<const mtmd_bitmap *> res(entries.size());
        for (size_t i = 0; i < entries.size(); i++) {
            res[i] = entries[i].ptr.get();
        }
        return res;
    }
};

struct input_chunks {
    input_chunks_ptr ptr;
    input_chunks() = default;
    input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
    ~input_chunks() = default;
    size_t size() { return mtmd_input_chunks_size(ptr.get()); }
    const mtmd_input_chunk * operator[](size_t idx) {
        return mtmd_input_chunks_get(ptr.get(), idx);
    }
};

} // namespace mtmd

#endif

#endif
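A brief sketch of the C++ wrappers in action; the mmproj path and the llama_model handle are placeholder assumptions, not values from this patch:

// Sketch: RAII-managed mtmd objects via the mtmd:: wrappers; every object
// frees its underlying C handle automatically at scope exit.
mtmd_context_params params = mtmd_context_params_default();
mtmd::context_ptr ctx(mtmd_init_from_file("mmproj.gguf", model, params)); // path/model assumed

mtmd::bitmaps media;
media.entries.emplace_back(mtmd_helper_bitmap_init_from_file(ctx.get(), "test-1.jpeg")); // check for nullptr in real code

mtmd::input_chunks chunks(mtmd_input_chunks_init());
mtmd_input_text text { "<__media__> what is shown here?", /*add_special=*/true, /*parse_special=*/true };

auto bitmaps_c_ptr = media.c_ptr();
int32_t res = mtmd_tokenize(ctx.get(), chunks.ptr.get(), &text,
                            bitmaps_c_ptr.data(), bitmaps_c_ptr.size());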
examples/mtmd/requirements.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
-r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=11.3.0
torch~=2.6.0
torchvision~=0.21.0
examples/mtmd/test-1.jpeg (new binary file, 121 KiB; not shown)
examples/mtmd/test-2.mp3 (new binary file; not shown)

examples/mtmd/tests.sh (new executable file, 163 lines)
@@ -0,0 +1,163 @@
#!/usr/bin/env bash

# make sure we are in the right directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR

#export LLAMA_CACHE="$SCRIPT_DIR/tmp"

set -eux

mkdir -p $SCRIPT_DIR/output

PROJ_ROOT="$SCRIPT_DIR/../.."
cd $PROJ_ROOT

# Check if the first argument is "big", then run test with big models
# This is useful if we're running the script on a larger machine, so we can test the big models
RUN_BIG_TESTS=false
if [ "${1:-}" = "big" ]; then
    RUN_BIG_TESTS=true
    echo "Include BIG models..."
fi

RUN_HUGE_TESTS=false
if [ "${1:-}" = "huge" ]; then
    RUN_HUGE_TESTS=true
    RUN_BIG_TESTS=true
    echo "Include BIG and HUGE models..."
fi

###############

arr_prefix=()
arr_hf=()
arr_tmpl=() # chat template
arr_file=()

add_test_vision() {
    local hf=$1
    local tmpl=${2:-""} # default to empty string if not provided
    arr_prefix+=("[vision]")
    arr_hf+=("$hf")
    arr_tmpl+=("$tmpl")
    arr_file+=("test-1.jpeg")
}

add_test_audio() {
    local hf=$1
    arr_prefix+=("[audio] ")
    arr_hf+=("$hf")
    arr_tmpl+=("") # no need for chat tmpl
    arr_file+=("test-2.mp3")
}

add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"

add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio  "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_audio  "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"

# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
    add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
    add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
    # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"

    add_test_audio  "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
    add_test_audio  "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
fi

# to test the huge models, run: ./tests.sh huge
# this will run both the big and huge models
# huge models are > 32B parameters
if [ "$RUN_HUGE_TESTS" = true ]; then
    add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
    add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
fi

# these models always give the wrong answer, not sure why
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"

# these models have a broken chat template, not usable
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"

###############

cmake --build build -j --target llama-mtmd-cli

arr_res=()

for i in "${!arr_hf[@]}"; do
    bin="llama-mtmd-cli"
    prefix="${arr_prefix[$i]}"
    hf="${arr_hf[$i]}"
    tmpl="${arr_tmpl[$i]}"
    inp_file="${arr_file[$i]}"

    echo "Running test with binary: $bin and HF model: $hf"
    echo ""
    echo ""

    output=$(\
        "$PROJ_ROOT/build/bin/$bin" \
        -hf "$hf" \
        --image $SCRIPT_DIR/$inp_file \
        -p "what is the publisher name of the newspaper?" \
        --temp 0 -n 128 \
        ${tmpl:+--chat-template "$tmpl"} \
        2>&1 | tee /dev/tty)

    echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

    if echo "$output" | grep -iq "new york"; then
        result="$prefix \033[32mOK\033[0m: $bin $hf"
    else
        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
    fi
    echo -e "$result"
    arr_res+=("$result")

    echo ""
    echo ""
    echo ""
    echo "#################################################"
    echo "#################################################"
    echo ""
    echo ""
done

set +x

for i in "${!arr_res[@]}"; do
    echo -e "${arr_res[$i]}"
done
echo ""
echo "Output logs are saved in $SCRIPT_DIR/output"
@@ -1235,7 +1235,7 @@ struct server_context {

         chat_templates = common_chat_templates_init(model, params.chat_template);
         try {
-            common_chat_format_example(chat_templates.get(), params.use_jinja);
+            common_chat_format_example(chat_templates.get(), params.use_jinja, {});
         }
         catch (const std::exception& e) {
             LOG_WARNING("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
@@ -3778,7 +3778,7 @@ int main(int argc, char ** argv) {
     });

     LOG_INFO("chat template", {
-        {"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params.use_jinja).c_str()
+        {"chat_example", common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params.use_jinja, {}).c_str()
        },
         {"built_in", params.chat_template.empty()},
     });