Convert models to row-interleaved quants using the quantize tool (#272)

* Repack a model with the quantize tool

* WIP

* Fixed various issues

As we have no way to tell whether a repacked quant has already been modified,
I had to remove the modification, at the expense of a slight decrease
in performance. This affects Q8_0_R8, Q8_KV_R8, and Q8_K_R8 on Zen4, and
Q4_0_R8 on ARM.

* Create wk_b and wv_b as Q8_0_R8 if the wkv_b type is interleaved

* Fix GCC 13.3 compilation error

* Another one

* Add missing include

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author:       Kawrakow
Date:         2025-03-21 07:23:36 +01:00
Committed by: GitHub
Parent:       712de34b12
Commit:       c5e554f941

7 changed files with 246 additions and 58 deletions

@@ -8194,7 +8194,8 @@ static bool llm_load_tensors(
auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);
- auto new_type = ggml_is_quantized(wkv_b.type) ? GGML_TYPE_Q8_0 : wkv_b.type;
+ auto new_type = ggml_is_quantized(wkv_b.type) ?
+     wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);
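For readability, the new nested ternary is equivalent to the following spelled-out form (added here for the reader, not part of the commit):

    // Equivalent to the new ternary above:
    ggml_type new_type;
    if (!ggml_is_quantized(wkv_b.type)) {
        new_type = wkv_b.type;                     // non-quantized (e.g. f16): keep as is
    } else if (wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8) {
        new_type = GGML_TYPE_Q8_0_R8;              // wkv_b is row-interleaved -> interleaved Q8_0
    } else {
        new_type = GGML_TYPE_Q8_0;                 // plain quant -> plain Q8_0
    }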
@@ -8218,6 +8219,9 @@ static bool llm_load_tensors(
ggml_set_name(l.computed_wk_b.get(), name.c_str());
ggml_backend_buffer_set_usage(l.computed_wk_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
ggml_backend_tensor_set(l.computed_wk_b.get(), wk_b->data, 0, ggml_nbytes(wk_b));
if (ggml_backend_buffer_is_host(l.computed_wk_b->buffer)) {
iqk_modify_tensor(l.computed_wk_b.get());
}
l.wk_b = l.computed_wk_b.get();
@@ -8243,6 +8247,9 @@ static bool llm_load_tensors(
ggml_set_name(l.computed_wv_b.get(), name.c_str());
ggml_backend_buffer_set_usage(l.computed_wv_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
ggml_backend_tensor_set(l.computed_wv_b.get(), wv_b->data, 0, ggml_nbytes(wv_b));
if (ggml_backend_buffer_is_host(l.computed_wv_b->buffer)) {
iqk_modify_tensor(l.computed_wv_b.get());
}
l.wv_b = l.computed_wv_b.get();
@@ -17140,11 +17147,48 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
return new_size;
}
static llama_ftype repacked_ftype(llama_ftype ftype) {
static std::unordered_map<llama_ftype, llama_ftype> k_map = {
{ LLAMA_FTYPE_MOSTLY_Q4_0, LLAMA_FTYPE_MOSTLY_Q4_0_R8 },
{ LLAMA_FTYPE_MOSTLY_Q8_0, LLAMA_FTYPE_MOSTLY_Q8_0_R8 },
{ LLAMA_FTYPE_MOSTLY_Q5_0, LLAMA_FTYPE_MOSTLY_Q5_0_R4 },
{ LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q2_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_S, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q3_K_L, LLAMA_FTYPE_MOSTLY_Q3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q4_K_S, LLAMA_FTYPE_MOSTLY_Q4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q4_K_M, LLAMA_FTYPE_MOSTLY_Q4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q5_K_S, LLAMA_FTYPE_MOSTLY_Q5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q5_K_M, LLAMA_FTYPE_MOSTLY_Q5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_Q6_K, LLAMA_FTYPE_MOSTLY_Q6_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_XXS, LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_XS, LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_XXS, LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ1_S, LLAMA_FTYPE_MOSTLY_IQ1_S_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_NL, LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_S, LLAMA_FTYPE_MOSTLY_IQ3_S_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_M, LLAMA_FTYPE_MOSTLY_IQ2_M_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_XS, LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 },
{ LLAMA_FTYPE_MOSTLY_IQ1_M, LLAMA_FTYPE_MOSTLY_IQ1_M_R4 },
{ LLAMA_FTYPE_MOSTLY_Q6_0, LLAMA_FTYPE_MOSTLY_Q6_0_R4 },
{ LLAMA_FTYPE_MOSTLY_BF16, LLAMA_FTYPE_MOSTLY_BF16_R16 },
{ LLAMA_FTYPE_MOSTLY_IQ2_BN, LLAMA_FTYPE_MOSTLY_IQ2_BN_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ2_K, LLAMA_FTYPE_MOSTLY_IQ2_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ3_K, LLAMA_FTYPE_MOSTLY_IQ3_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_K, LLAMA_FTYPE_MOSTLY_IQ4_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ5_K, LLAMA_FTYPE_MOSTLY_IQ5_K_R4 },
{ LLAMA_FTYPE_MOSTLY_IQ4_KS, LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 },
{ LLAMA_FTYPE_MOSTLY_Q8_KV, LLAMA_FTYPE_MOSTLY_Q8_KV_R8 },
};
if (auto it = k_map.find(ftype); it != k_map.end()) return it->second;
return ftype;
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
- switch (params->ftype) {
+ switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
@@ -17256,7 +17300,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ftype = model.ftype;
}
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
- if (params->imatrix) {
+ if (!params->only_repack && params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
@@ -17278,7 +17322,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.meta);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
// Remove split metadata
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
@@ -17303,9 +17346,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
bool is_repacked = ml.ftype >= LLAMA_FTYPE_MOSTLY_Q4_0_R8 && ml.ftype <= LLAMA_FTYPE_MOSTLY_Q8_K_R8;
int n_to_repack = 0, n_to_modify = 0;
for (int i = 0; i < ml.n_tensors; ++i) {
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
if (params->only_repack) {
auto repacked_type = (ggml_type)iqk_repacked_type(meta);
if (repacked_type != meta->type) {
++n_to_repack;
} else if (!is_repacked) {
if (iqk_should_modify_tensor(meta)) ++n_to_modify;
}
}
const std::string name = ggml_get_name(meta);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -17317,6 +17371,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
if (params->only_repack) {
if (n_to_repack == 0 && n_to_modify == 0) {
printf("=========================== %s: nothing to do for only_repack option\n", __func__);
return;
}
ftype = repacked_ftype(model.ftype);
printf("===================== Model ftype: %s: Repacked ftype: %s\n", llama_model_ftype_name(model.ftype).c_str(),
llama_model_ftype_name(ftype).c_str());
}
gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks
@@ -17457,6 +17523,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
void * new_data;
size_t new_size;
if (params->only_repack) {
ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor);
bool modify = !is_repacked && iqk_should_modify_tensor(tensor);
if (modify || repacked_type != tensor->type) {
new_type = repacked_type;
new_size = ggml_nbytes(tensor);
if ((int)work.size() < new_size) work.resize(new_size);
new_data = work.data();
auto aux_tensor = *tensor;
aux_tensor.data = work.data();
std::memcpy(aux_tensor.data, tensor->data, new_size);
if (repacked_type != tensor->type) {
iqk_repack_tensor(&aux_tensor);
GGML_ASSERT(aux_tensor.type == repacked_type);
} else {
bool did_modify = iqk_modify_tensor(&aux_tensor);
GGML_ASSERT(did_modify);
}
}
else {
new_type = tensor->type;
new_size = ggml_nbytes(tensor);
new_data = tensor->data;
}
LLAMA_LOG_INFO("size = %8.3f MB, type = %s\n", new_size/1024.0/1024.0, ggml_type_name(new_type));
goto QuantizationDone;
}
if (quantize) {
new_type = default_type;
if (new_type == GGML_TYPE_BF16_R16 && strcmp(tensor->name, "token_embd.weight") == 0) {
@@ -17562,7 +17658,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type == GGML_TYPE_IQ1_S_R4||
new_type == GGML_TYPE_IQ1_M_R4||
(new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
- (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
+ (new_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
LLAMA_LOG_ERROR("\n\n============================================================\n");
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -17727,6 +17823,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
QuantizationDone:;
total_size_org += ggml_nbytes(tensor);
total_size_new += new_size;
@@ -18051,6 +18149,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.pure =*/ false,
/*.keep_split =*/ false,
/*.ignore_imatrix_rules =*/ false,
/*.only_repack =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.custom_quants =*/ nullptr,
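
For reference, a minimal sketch of driving the new option through the quantize API (paths and thread count are placeholders; only_repack is the field added above):

    #include "llama.h"

    // Minimal sketch: repack an existing quantized GGUF into its row-interleaved
    // variant without re-quantizing (error handling omitted).
    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_repack = true;   // added in this commit
        params.nthread     = 8;
        // A Q4_K_M input would come out as Q4_K_R4, per repacked_ftype() above.
        return llama_model_quantize("model-Q4_K_M.gguf", "model-Q4_K_R4.gguf", &params);
    }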