From 34be9d8d5709051fcd8367b5e6c06f3e35633aad Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 14 Apr 2025 10:03:39 +0300
Subject: [PATCH] imatrix: collect layer influence statistics

---
 examples/imatrix/imatrix.cpp | 100 ++++++++++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index d8a43049..9fc4ce1d 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -19,6 +19,8 @@
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>
+#include <optional>
+#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -49,13 +49,19 @@ public:
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
+    void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
+    void print_layer_importance();
 private:
     std::unordered_map<std::string, Stats> m_stats;
     gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
+    int m_last_layer = 9999;
     std::vector<float> m_src1_data;
     std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<float> m_last_input;
+    std::vector<std::pair<double, int>> m_layer_sim;
+    bool m_collect_lsim = false;
 };
 
 // remove any prefix and suffixes from the name
@@ -77,6 +85,37 @@ static std::string filter_tensor_name(const char * name) {
     return wname;
 }
 
+static std::optional<int> layer_index(const std::string& name) {
+    if (auto pos = name.find("blk."); pos == 0) {
+        pos += 4;
+        if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
+            auto index_str = name.substr(pos, pos1 - pos);
+            std::istringstream str(index_str);
+            int index; str >> index;
+            if (!str.fail()) return index;
+        }
+    }
+    return std::nullopt;
+}
+
+void IMatrixCollector::print_layer_importance() {
+    printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
+    if (m_layer_sim.empty()) return;
+    std::vector<std::pair<float, int>> layers;
+    layers.reserve(m_layer_sim.size());
+    for (int i = 0; i < int(m_layer_sim.size()); ++i) {
+        if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
+    }
+    if (layers.empty()) return;
+    std::sort(layers.begin(), layers.end());
+    printf("======================== sorted layer importances\n");
+    int j = 0;
+    for (auto& p : layers) {
+        int i = p.second;
+        printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
+    }
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
@@ -182,6 +221,45 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
+        if (m_collect_lsim) {
+            // We only need to do it here and not in the MoE branch above because the first tensor in a layer
+            // never is a MoE tensor
+            if (auto index = layer_index(src0->name); index.has_value()) {
+                if (*index != m_last_layer) {
+                    if (*index > 0) {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops: different size (%d vs %d)\n", (int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size());
+                            exit(1);
+                        }
+                        if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
+                        auto& p = m_layer_sim[*index - 1];
+                        auto x = m_last_input.data();
+                        auto y = (const float *)data;
+                        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                            double sumxy = 0, sumx2 = 0, sumy2 = 0;
+                            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                                sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
+                            }
+                            double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/sqrt(sumx2*sumy2) : 0;
+                            p.first += cos_sim;
+                            p.second += 1;
+                            x += src1->ne[0];
+                            y += src1->ne[0];
+                        }
+                    }
+                    m_last_layer = *index;
+                    if (m_last_input.empty()) {
+                        m_last_input.resize(src1->ne[0]*src1->ne[1]);
+                    } else {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops\n"); exit(1);
+                        }
+                    }
+                    //printf("Copying src1 to m_last_input\n");
+                    std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
+                }
+            }
+        }
         auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
@@ -622,7 +700,25 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    bool lsim = false;
+    //
+    // Do not pollute common with totally imatrix specific arguments as it was done in mainline.
+    // Instead, parse imatrix specific args here, push unknown args into a new array of args,
+    // and pass that to gpt_params_parse().
+    //
+    std::vector<char*> args;
+    args.reserve(argc);
+    args.push_back(argv[0]);
+    for (int i = 1; i < argc; ++i) {
+        std::string arg{argv[i]};
+        if (arg == "-lsim" || arg == "--layer-similarity") {
+            lsim = true;
+        } else {
+            args.push_back(argv[i]);
+        }
+    }
+
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
         print_usage(argc, argv, params);
         return 1;
     }
@@ -630,6 +726,7 @@
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     g_collector.set_params(params);
+    g_collector.set_collect_lsim(lsim);
 
     for (const auto & in_file : params.in_files) {
         printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -680,6 +777,7 @@
     }
 
     g_collector.save_imatrix();
+    g_collector.print_layer_importance();
 
     llama_print_timings(ctx);