Mimo-V2-Flash support (#1096)

* Mimo-2 support
* Fix bug for head sizes not being the same
  It still does not solve the Mimo-2 quantized cache issue.
* Fix quantized cache
* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-30 11:21:56 +00:00 · 2026-01-05 08:00:01 +02:00
parent 56dceefd6b
commit ab50c6cdcb
12 changed files with 251 additions and 54 deletions
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -113,7 +113,7 @@ struct llama_hparams {

 	// qwen3vl deepstack
    uint32_t n_deepstack_layers = 0;
-	
+
    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = -1;
@@ -122,6 +122,8 @@ struct llama_hparams {
    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

+    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+
    bool operator!=(const llama_hparams & other) const {
        if (this->vocab_only    != other.vocab_only)    return true;
        if (this->n_vocab       != other.n_vocab)       return true;