imatrix: collect layer influence statistics (#328)

* imatrix: collect layer influence statistics

* imatrix: collect layer influence statistics also for the last layer

For the last layer we need to use the input to the output.weight
tensor. The last layer(s) tend to be important, so it is useful to also
have an influence metric for them; a small standalone sketch of the
metric follows this list.

* imatrix: separate metric for attention and ffn importance

* Use stripped tensor name, not src0->name
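
The influence statistic is the mean cosine similarity between the activations
entering a block and the activations leaving it: the lower |<cos_sim>|, the more
the block transforms the hidden state, and the higher it is ranked. The real
implementation is cosine_similarity/collect_cos_similarity in the diff below;
this is a minimal standalone sketch of the computation (illustrative only, not
part of the patch; all names are made up):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // mean cosine similarity between corresponding rows of x and y,
    // each holding nrow rows of n floats
    static double mean_cos_sim(int nrow, int n, const float * x, const float * y) {
        double sum = 0;
        for (int row = 0; row < nrow; ++row, x += n, y += n) {
            double xy = 0, x2 = 0, y2 = 0;
            for (int j = 0; j < n; ++j) {
                xy += x[j]*y[j]; x2 += x[j]*x[j]; y2 += y[j]*y[j];
            }
            sum += x2 > 0 && y2 > 0 ? xy/std::sqrt(x2*y2) : 0;
        }
        return nrow > 0 ? sum/nrow : 0;
    }

    int main() {
        // two fake token rows: layer input vs layer output
        std::vector<float> in  = {1, 0,  0, 1};
        std::vector<float> out = {1, 1,  0, 1};
        // prints ~0.854; values near 1 mean the layer barely changed the
        // hidden state, so it would rank as less important
        std::printf("<cos_sim> = %g\n", mean_cos_sim(2, 2, in.data(), out.data()));
    }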

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date: 2025-04-14 19:43:19 +02:00
Committed by: GitHub
Parent: 9be8812727
Commit: 70a1d99fb8

@@ -19,6 +19,8 @@
#include <fstream>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -49,13 +51,59 @@ public:
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
void print_layer_importance();
private:
std::unordered_map<std::string, Stats> m_stats;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
int m_last_layer = 9999; // sentinel: no layer index seen yet
int m_last_ffn = -1;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
std::vector<float> m_last_input;
std::vector<float> m_ffn_input;
std::vector<std::pair<double,int>> m_layer_sim;
std::vector<std::pair<double,int>> m_attn_sim;
std::vector<std::pair<double,int>> m_ffn_sim;
bool m_collect_lsim = false;
std::optional<int> layer_index(const std::string& name) const {
if (name == m_params.output_tensor_name && m_last_layer < 199) {
return m_last_layer + 1;
}
if (auto pos = name.find("blk."); pos == 0) {
pos += 4;
if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
auto index_str = name.substr(pos, pos1 - pos);
std::istringstream str(index_str);
int index; str >> index;
if (!str.fail()) return index;
}
}
return std::nullopt;
}
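// Example: "blk.17.ffn_up.weight" -> 17; the tensor named m_params.output_tensor_name
// (typically "output.weight") -> m_last_layer + 1, so the last real layer can be
// scored against the input of the output projection.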
static inline double cosine_similarity(int n, const float * x, const float * y) {
double sumxy = 0, sumx2 = 0, sumy2 = 0;
for (int j = 0; j < n; ++j) {
sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
}
double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/sqrt(sumx2*sumy2) : 0;
return cos_sim;
}
static inline void collect_cos_similarity(int nrow, int n, const float * x, const float * y, std::pair<double, int>& p) {
for (int row = 0; row < nrow; ++row) {
p.first += cosine_similarity(n, x, y);
p.second += 1;
x += n;
y += n;
}
}
static void print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim);
};
// remove any prefix and suffixes from the name
@@ -77,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
return wname;
}
void IMatrixCollector::print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim) {
if (sim.empty()) return;
std::vector<std::pair<float, int>> layers;
layers.reserve(sim.size());
for (int i = 0; i < int(sim.size()); ++i) {
if (sim[i].second > 0) layers.emplace_back(float(std::abs(sim[i].first/sim[i].second)), i);
}
if (layers.empty()) return;
std::sort(layers.begin(), layers.end());
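// ascending by |<cos_sim>|: the layers that transform their input the most
// (lowest similarity, i.e. highest influence) are printed first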
printf("%s\n", msg);
int j = 0;
for (auto& p : layers) {
int i = p.second;
printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, sim[i].first/sim[i].second);
}
}
void IMatrixCollector::print_layer_importance() {
print_layer_importance("\n======================== sorted layer importances", m_layer_sim);
print_layer_importance("\n======================== sorted attention importances", m_attn_sim);
print_layer_importance("\n======================== sorted ffn importances", m_ffn_sim);
//printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
//if (m_layer_sim.empty()) return;
//std::vector<std::pair<float, int>> layers;
//layers.reserve(m_layer_sim.size());
//for (int i = 0; i < int(m_layer_sim.size()); ++i) {
// if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
//}
//if (layers.empty()) return;
//std::sort(layers.begin(), layers.end());
//printf("======================== sorted layer importances\n");
//int j = 0;
//for (auto& p : layers) {
// int i = p.second;
// printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
//}
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
@@ -92,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
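// ask phase: decide whether this tensor's activations should be captured on
// the next, data-carrying invocation of the callback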
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
//printf("wname = %s\n", wname.c_str());
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == m_params.output_tensor_name))) return false;
if (!(wname.substr(0, 4) == "blk." || ((m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name))) return false;
return true;
}
@@ -108,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
if (m_collect_lsim) {
if (wname.find(".ffn_") != std::string::npos) {
if (auto index = layer_index(wname); index.has_value() && *index == m_last_layer && *index != m_last_ffn) {
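// First ffn_* matmul seen for the current layer: its input is the hidden state
// after the attention part of the layer. Stash it to score the attention part
// against the layer input recorded below (m_attn_sim), and later the ffn part
// against the next layer's input (m_ffn_sim).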
int n = src1->ne[0];
int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne[2] : src1->ne[1];
if (t->op == GGML_OP_MUL_MAT_ID) {
GGML_ASSERT(src1->ne[1] == 1);
}
if (m_ffn_input.empty()) {
m_ffn_input.resize(nrow*n);
} else {
if ((int)m_ffn_input.size() != nrow*n) {
printf("Oops, inconsistent ffn size\n"); exit(1);
}
}
std::memcpy(m_ffn_input.data(), data, nrow*n*sizeof(float));
if (m_ffn_input.size() != m_last_input.size()) {
printf("Oops, inconsistent ffn vs last_input size\n"); exit(1);
}
if (m_attn_sim.size() < *index + 1) m_attn_sim.resize(*index + 1);
auto& p = m_attn_sim[*index];
collect_cos_similarity(nrow, n, m_ffn_input.data(), m_last_input.data(), p);
m_last_ffn = *index;
}
}
}
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -182,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
} else {
if (m_collect_lsim) {
// We only need to do this here and not in the MoE branch above, because the
// first tensor in a layer is never a MoE tensor
if (auto index = layer_index(wname); index.has_value()) {
if (*index != m_last_layer) {
if (*index > 0) {
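// Entering layer *index: the current activations are the previous layer's output,
// so score that layer by comparing them with the input stashed when it was entered
// (m_last_input -> m_layer_sim) and, if its ffn input was stashed, with that too
// (m_ffn_input -> m_ffn_sim).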
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n",
(int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size(), src0->name, m_last_layer);
exit(1);
}
if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
auto& p = m_layer_sim[*index - 1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_last_input.data(), (const float *)data, p);
if (*index == m_last_ffn + 1) {
if (*index > m_ffn_sim.size()) m_ffn_sim.resize(*index);
auto& p1 = m_ffn_sim[*index-1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_ffn_input.data(), (const float *)data, p1);
}
}
m_last_layer = *index;
if (m_last_input.empty()) {
m_last_input.resize(src1->ne[0]*src1->ne[1]);
} else {
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops\n"); exit(1);
}
}
//printf("Copying src1 to m_last_input\n");
std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
}
}
}
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
@@ -622,7 +769,25 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
bool lsim = false;
//
// Do not pollute common with imatrix-specific arguments, as was done in mainline.
// Instead, parse the imatrix-specific args here, push all remaining args into a new
// array, and pass that to gpt_params_parse().
//
std::vector<char*> args;
args.reserve(argc);
args.push_back(argv[0]);
for (int i = 1; i < argc; ++i) {
std::string arg{argv[i]};
if (arg == "-lsim" || arg == "--layer-similarity") {
lsim = true;
} else {
args.push_back(argv[i]);
}
}
if (!gpt_params_parse(args.size(), args.data(), params)) {
print_usage(argc, argv, params);
return 1;
}
@@ -630,6 +795,7 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
g_collector.set_collect_lsim(lsim);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -680,6 +846,7 @@ int main(int argc, char ** argv) {
}
g_collector.save_imatrix();
g_collector.print_layer_importance();
llama_print_timings(ctx);
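
With the new flag, an invocation along these lines (binary name, model, and
calibration file are placeholders) collects the imatrix as before and prints
the sorted layer, attention, and ffn importance tables at the end of the run:

    ./bin/llama-imatrix -m model.gguf -f calibration.txt --layer-similarity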