Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
imatrix: wv_b <-> wkv_b
@@ -195,7 +195,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
-        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+        for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
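
The one-line change in this hunk widens the activation-statistics loop from src1->ne[1] rows to all src1->ne[1]*src1->ne[2] rows, so 3D activations (ne[2] > 1) are no longer silently truncated. Below is a minimal standalone sketch of that accumulation, assuming the activations are a contiguous float buffer of shape ne0 x ne1 x ne2 and that one running sum of squares is kept per column; the helper name and test values are made up for illustration.

#include <cstdio>
#include <vector>

// Minimal sketch of the imatrix accumulation after the fix: every row of a
// contiguous ne0 x ne1 x ne2 activation buffer contributes its squared values,
// not just the first ne1 rows. Names and shapes here are illustrative only.
static void accumulate_sq(const float * data, int ne0, int ne1, int ne2, std::vector<float> & values) {
    values.assign(ne0, 0.0f);
    for (int row = 0; row < ne1*ne2; ++row) {        // was: row < ne1, which skipped ne2 > 1
        const float * x = data + (size_t)row * ne0;
        for (int j = 0; j < ne0; ++j) {
            values[j] += x[j]*x[j];
        }
    }
}

int main() {
    const int ne0 = 4, ne1 = 2, ne2 = 3;             // e.g. a 3D / batched src1
    std::vector<float> data(ne0*ne1*ne2, 1.0f);      // all ones: each column sums to ne1*ne2
    std::vector<float> values;
    accumulate_sq(data.data(), ne0, ne1, ne2, values);
    printf("values[0] = %g (expected %d)\n", values[0], ne1*ne2);
}
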
@@ -556,7 +556,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_f32_f16<cpy_1_f16_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
     } else {
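
The changed line fixes the F16 -> F16 branch of the copy-kernel dispatch, which previously instantiated the kernel with the F32 -> F16 element converter and so would read the half-precision source as if it were float. The sketch below is a CPU-side illustration of why the per-element converter has to match the source type; the converter names echo the diff, but the simplified signatures, the uint16_t stand-in for half, and the cast used for conversion are assumptions rather than the actual CUDA implementation.

#include <cstdint>
#include <cstdio>
#include <cstring>

// CPU-side sketch, not the real CUDA kernels. The element converter is picked
// per (src, dst) type pair; an F16 source must be copied with an f16->f16
// converter. "half" is modelled as uint16_t purely for illustration.
using half_t = uint16_t;

static void cpy_1_f32_f16(const char * src, char * dst) {
    float v; std::memcpy(&v, src, sizeof(v));      // expects 4 bytes of f32 input
    half_t h = (half_t)v;                          // placeholder for a real float->half conversion
    std::memcpy(dst, &h, sizeof(h));
}

static void cpy_1_f16_f16(const char * src, char * dst) {
    std::memcpy(dst, src, sizeof(half_t));         // plain 2-byte copy, no reinterpretation
}

int main() {
    float f = 2.0f; half_t hf = 0;
    cpy_1_f32_f16((const char *)&f, (char *)&hf);  // correct use: the source really is f32

    half_t src = 0x3c00, dst = 0;                  // 0x3c00 is 1.0 in IEEE half
    cpy_1_f16_f16((const char *)&src, (char *)&dst);
    printf("f32->f16 placeholder: 0x%04x, raw f16 copy: 0x%04x\n", (unsigned)hf, (unsigned)dst);
    // Calling cpy_1_f32_f16 on the 2-byte f16 buffer instead would read 4 bytes
    // from a 2-byte object and produce garbage -- the mismatch the hunk above fixes.
}
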
@@ -10468,7 +10468,7 @@ static void ggml_compute_forward_dup_bytes(
     if (ggml_is_contiguous(dst)) {
         size_t id = 0;
         char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00); //ne00 * type_size;
 
         if (nb00 == type_size) {
             // src0 is contigous on first dimension, copy by rows
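
The replaced line matters once src0 is a block-quantized type: ne00 * type_size only equals the row byte count when type_size is the size of a single element, while ggml_row_size accounts for rows being stored as ne00 / block_size blocks of a fixed byte width. A self-contained sketch of that distinction follows; the block parameters are illustrative, not taken from ggml's type table.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sketch of a row-size computation for a block-quantized type. The "type size"
// of such a type is the byte size of one block covering block_elems elements,
// so multiplying it by the element count over-counts by a factor of block_elems.
static size_t row_size_sketch(int64_t ne, size_t block_elems, size_t block_bytes) {
    return (size_t)(ne / block_elems) * block_bytes;   // assumes ne is a multiple of block_elems
}

int main() {
    const int64_t ne00        = 4096;
    const size_t  block_elems = 32;   // elements per block (illustrative)
    const size_t  block_bytes = 18;   // bytes per block    (illustrative)

    size_t blockwise = row_size_sketch(ne00, block_elems, block_bytes); // ggml_row_size-style result
    size_t naive     = (size_t)ne00 * block_bytes;                      // the old ne00 * type_size pattern
    printf("blockwise row size: %zu bytes, naive: %zu bytes\n", blockwise, naive);
}
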
@@ -13787,6 +13787,7 @@ struct llm_build_context {
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
                         ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank)*n_embd_head_v, 0);
                 cb(wv_b, "wv_b", il);
+                std::memcpy(wv_b->name, model.layers[il].wv_b->name, GGML_MAX_NAME);
 
                 kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed);
                 cb(kqv, "kqv", il);
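
The added std::memcpy gives the wv_b view created here the same name as the underlying model tensor. That is what lets name-keyed consumers, such as the imatrix collector touched in the first hunk, attribute the activations flowing through the view to *.attn_v_b.weight rather than to an anonymous view. A toy sketch of the idea follows; the tensor struct, buffer size, and names are stand-ins, not ggml's.

#include <cstring>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Stand-in for a ggml-style tensor: a fixed-size name buffer that downstream
// code (e.g. an imatrix collector) uses as the lookup key.
constexpr size_t MAX_NAME = 64;
struct tensor_sketch { char name[MAX_NAME]; };

int main() {
    tensor_sketch wv_b_model{};                        // the model weight, with its canonical name
    std::snprintf(wv_b_model.name, MAX_NAME, "blk.0.attn_v_b.weight");

    tensor_sketch wv_b_view{};                         // a view created at graph-build time
    std::snprintf(wv_b_view.name, MAX_NAME, "wv_b (view)");

    std::map<std::string, std::vector<float>> stats;   // name-keyed statistics, as in imatrix collection
    stats["blk.0.attn_v_b.weight"] = {1.0f, 2.0f};

    // Without the copy the lookup below misses; with it, the view inherits the key.
    std::memcpy(wv_b_view.name, wv_b_model.name, MAX_NAME);
    printf("found stats for view: %s\n", stats.count(wv_b_view.name) ? "yes" : "no");
}
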
@@ -17347,6 +17348,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const float * imatrix = nullptr;
         if (imatrix_data) {
             auto it = imatrix_data->find(tensor->name);
+            if (it == imatrix_data->end()) {
+                // MLA hack: most imatrix files floating around the Internet have been computed with standard attention.
+                // This means that the imatrix file does not contain data for the *.attn_k_b.weight and *.attn_v_b.weight
+                // required by MLA. But the *.attn_v_b.weight tensors "see" the exact same activations as the
+                // *.attn_kv_b.weight tensors used in standard attention. Hence, if we find imatrix data for
+                // *.attn_kv_b.weight we can use it for *.attn_v_b.weight and vice versa.
+                std::string name{tensor->name};
+                static std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
+                for (int j = 0; j < int(alternatives.size()); ++j) {
+                    if (auto pos = name.find(alternatives[j]); pos != std::string::npos) {
+                        int j1 = (j + 1) % alternatives.size();
+                        auto alternative_name = name.substr(0, pos) + alternatives[j1];
+                        it = imatrix_data->find(alternative_name);
+                        break;
+                    }
+                }
+            }
             if (it == imatrix_data->end()) {
                 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
             } else {
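
The added block is the fallback its comment describes: when no imatrix entry exists under the tensor's own name, the quantizer retries with ".attn_v_b.weight" and ".attn_kv_b.weight" swapped, because both tensors see the same activations. Below is a standalone sketch of the same lookup factored into a helper; the map type and example name are simplified stand-ins for the quantizer's imatrix_data.

#include <array>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using imatrix_map = std::map<std::string, std::vector<float>>;

// Try the tensor's own name first; if absent, retry with the MLA counterpart
// name (attn_v_b <-> attn_kv_b), mirroring the fallback added in the hunk above.
static imatrix_map::const_iterator find_with_mla_fallback(const imatrix_map & data, const std::string & name) {
    auto it = data.find(name);
    if (it != data.end()) return it;
    static const std::array<std::string, 2> alternatives{".attn_v_b.weight", ".attn_kv_b.weight"};
    for (size_t j = 0; j < alternatives.size(); ++j) {
        if (auto pos = name.find(alternatives[j]); pos != std::string::npos) {
            return data.find(name.substr(0, pos) + alternatives[(j + 1) % alternatives.size()]);
        }
    }
    return data.end();
}

int main() {
    imatrix_map data;                                   // imatrix computed with standard attention:
    data["blk.0.attn_kv_b.weight"] = {0.5f, 0.25f};     // only the kv_b tensor is present

    auto it = find_with_mla_fallback(data, "blk.0.attn_v_b.weight");
    printf("%s\n", it != data.end() ? it->first.c_str() : "not found");
}
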