Port of Qwen3-VL support from mainline (#883)

* Port of Qwen3-VL for latest ik_llama.cpp - convert_hf_to_gguf.py - Not touched, use llama.cpp to convert model instead - sysl and metal support for imrope not added - Vulkan support for imrope not tested - Code not tested * Bugfix n_embd was declared multiple times https://github.com/ikawrakow/ik_llama.cpp/pull/883#issuecomment-3471179655 * Fix n_embd issue with qwen3vl * model.output tensor not required https://github.com/ikawrakow/ik_llama.cpp/pull/883#discussion_r2480388389 * Improved logic for qkv combined tensors 59ceaf8fcb (r2480395800) 59ceaf8fcb (r2480398187) * Fix n_embd for merge_qkv() + cleaner code https://github.com/ikawrakow/ik_llama.cpp/pull/883#discussion_r2481227395 * Revert TENSOR_NOT_REQUIRED
2026-04-28 10:21:48 +00:00 · 2025-11-04 17:20:54 +00:00
parent efcb5f9d9e
commit 86597623a5
21 changed files with 850 additions and 78 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1165,6 +1165,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
+        // MRoPE (Multi-axis Rotary Position Embedding) sections
+        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
+            LLAMA_LOG_INFO("%s: mrope sections   = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
+        }
        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
@@ -1230,7 +1234,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
    }

-    if (model.arch == LLM_ARCH_QWEN3MOE || model.arch == LLM_ARCH_OPENAI_MOE) {
+    if (model.arch == LLM_ARCH_QWEN3MOE || model.arch == LLM_ARCH_OPENAI_MOE || model.arch == LLM_ARCH_QWEN3VLMOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
    }

@@ -2054,7 +2058,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        auto tim1 = ggml_time_us();
 #endif
        const int64_t n_tokens = batch.n_tokens;
-        const int n_pos_per_embd =  hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
+        const int n_pos_per_embd =  hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
        if (batch.token && n_pos_per_embd == 4) {
            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
            for (int i = 0; i < n_tokens; ++i) {
@@ -4668,6 +4672,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

+        case LLM_ARCH_QWEN3VL:
+        case LLM_ARCH_QWEN3VLMOE:
+            return LLAMA_ROPE_TYPE_IMROPE;
+
        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");