Port mdmd from mainline + Qwen2/2.5-VL support (#798)

* Add mtmd: the beginning

* Add mtmd: mtmd.cpp compiles

* Add mtmd: clip initialization compiles

* Add mtmd: clip.cpp compiles

* Add mtmd: builds successfully

* Add CPU implementation for GGML_OP_GLU

* Add CUDA implementation for GGML_OP_GLU

* Add CPU implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW

* Add CUDA implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW

* Add mtmd: refresh CPU rope

* Add mtmd: refresh CUDA rope

* Add mtmd: add Qwen2-VL

* Add mtmd: Qwen2.5-VL text seems to work with this change

* Add mtmd: fix swiglu

* Add mtmd: use LOG_TEE so generated tokens show up in terminal

* Add mtmd: do not attempt to load a GPU backend if none are available

* GLU, not GPU

* Fix typo

* Fix new/free mismatch

* LOG stuff

* Add mtmd: this fixes gibberish on second image

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-09-27 08:45:29 +02:00
committed by GitHub
parent 7d8d232896
commit c1a0e15377
51 changed files with 115141 additions and 432 deletions

View File

@@ -68,6 +68,29 @@ struct llama_control_vector_load_info;
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_COUNT,
};
//
// CLI argument parsing
//
@@ -86,6 +109,14 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};
struct model_paths {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -230,8 +261,10 @@ struct gpt_params {
std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
// multimodal models (see examples/mtmd)
model_paths mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding