From 34be9d8d5709051fcd8367b5e6c06f3e35633aad Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 14 Apr 2025 10:03:39 +0300
Subject: [PATCH] imatrix: collect layer influence statistics

---
 examples/imatrix/imatrix.cpp | 100 ++++++++++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index d8a43049..9fc4ce1d 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -19,6 +19,8 @@
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>
+#include <optional>
+#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -49,13 +49,19 @@ public:
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
+    void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
+    void print_layer_importance();
 private:
     std::unordered_map<std::string, Stats> m_stats;
     gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
+    int m_last_layer = 9999;
     std::vector<float> m_src1_data;
     std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<float> m_last_input;
+    std::vector<std::pair<double, int>> m_layer_sim;
+    bool m_collect_lsim = false;
 };
 
 // remove any prefix and suffixes from the name
@@ -77,6 +85,37 @@ static std::string filter_tensor_name(const char * name) {
     return wname;
 }
 
+static std::optional<int> layer_index(const std::string& name) {
+    if (auto pos = name.find("blk."); pos == 0) {
+        pos += 4;
+        if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
+            auto index_str = name.substr(pos, pos1 - pos);
+            std::istringstream str(index_str);
+            int index; str >> index;
+            if (!str.fail()) return index;
+        }
+    }
+    return std::nullopt;
+}
+
+void IMatrixCollector::print_layer_importance() {
+    printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
+    if (m_layer_sim.empty()) return;
+    std::vector<std::pair<float, int>> layers;
+    layers.reserve(m_layer_sim.size());
+    for (int i = 0; i < int(m_layer_sim.size()); ++i) {
+        if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
+    }
+    if (layers.empty()) return;
+    std::sort(layers.begin(), layers.end());
+    printf("======================== sorted layer importances\n");
+    int j = 0;
+    for (auto& p : layers) {
+        int i = p.second;
+        printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
+    }
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
@@ -182,6 +221,45 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
+        if (m_collect_lsim) {
+            // We only need to do it here and not in the MoE branch above because the first tensor in a layer
+            // never is a MoE tensor
+            if (auto index = layer_index(src0->name); index.has_value()) {
+                if (*index != m_last_layer) {
+                    if (*index > 0) {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops: different size (%d vs %d)\n", (int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size());
+                            exit(1);
+                        }
+                        if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
+                        auto& p = m_layer_sim[*index - 1];
+                        auto x = m_last_input.data();
+                        auto y = (const float *)data;
+                        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                            double sumxy = 0, sumx2 = 0, sumy2 = 0;
+                            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                                sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
+                            }
+                            double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/sqrt(sumx2*sumy2) : 0;
+                            p.first += cos_sim;
+                            p.second += 1;
+                            x += src1->ne[0];
+                            y += src1->ne[0];
+                        }
+                    }
+                    m_last_layer = *index;
+                    if (m_last_input.empty()) {
+                        m_last_input.resize(src1->ne[0]*src1->ne[1]);
+                    } else {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops\n"); exit(1);
+                        }
+                    }
+                    //printf("Copying src1 to m_last_input\n");
+                    std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
+                }
+            }
+        }
         auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
@@ -622,7 +700,25 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    bool lsim = false;
+    //
+    // Do not pollute common with totally imatrix specific arguments as it was done in mainline.
+    // Instead, parse imatrix specific args here, push unknown args into a new array of args,
+    // and pass that to gpt_params_parse().
+    //
+    std::vector<char*> args;
+    args.reserve(argc);
+    args.push_back(argv[0]);
+    for (int i = 1; i < argc; ++i) {
+        std::string arg{argv[i]};
+        if (arg == "-lsim" || arg == "--layer-similarity") {
+            lsim = true;
+        } else {
+            args.push_back(argv[i]);
+        }
+    }
+
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
         print_usage(argc, argv, params);
         return 1;
     }
@@ -630,6 +726,7 @@
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     g_collector.set_params(params);
+    g_collector.set_collect_lsim(lsim);
 
     for (const auto & in_file : params.in_files) {
         printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -680,6 +777,7 @@
     }
 
     g_collector.save_imatrix();
+    g_collector.print_layer_importance();
 
     llama_print_timings(ctx);