mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 11:21:56 +00:00
imatrix: collect layer influence statistics
This commit is contained in:
@@ -19,6 +19,8 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <optional>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
@@ -49,13 +51,19 @@ public:
|
|||||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
void save_imatrix(int ncall = -1) const;
|
void save_imatrix(int ncall = -1) const;
|
||||||
bool load_imatrix(const char * file_name);
|
bool load_imatrix(const char * file_name);
|
||||||
|
void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
|
||||||
|
void print_layer_importance();
|
||||||
private:
|
private:
|
||||||
std::unordered_map<std::string, Stats> m_stats;
|
std::unordered_map<std::string, Stats> m_stats;
|
||||||
gpt_params m_params;
|
gpt_params m_params;
|
||||||
std::mutex m_mutex;
|
std::mutex m_mutex;
|
||||||
int m_last_call = 0;
|
int m_last_call = 0;
|
||||||
|
int m_last_layer = 9999;
|
||||||
std::vector<float> m_src1_data;
|
std::vector<float> m_src1_data;
|
||||||
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
|
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
|
||||||
|
std::vector<float> m_last_input;
|
||||||
|
std::vector<std::pair<double,int>> m_layer_sim;
|
||||||
|
bool m_collect_lsim = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
// remove any prefix and suffixes from the name
|
// remove any prefix and suffixes from the name
|
||||||
@@ -77,6 +85,37 @@ static std::string filter_tensor_name(const char * name) {
|
|||||||
return wname;
|
return wname;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::optional<int> layer_index(const std::string& name) {
|
||||||
|
if (auto pos = name.find("blk."); pos == 0) {
|
||||||
|
pos += 4;
|
||||||
|
if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
|
||||||
|
auto index_str = name.substr(pos, pos1 - pos);
|
||||||
|
std::istringstream str(index_str);
|
||||||
|
int index; str >> index;
|
||||||
|
if (!str.fail()) return index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
|
void IMatrixCollector::print_layer_importance() {
|
||||||
|
printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
|
||||||
|
if (m_layer_sim.empty()) return;
|
||||||
|
std::vector<std::pair<float, int>> layers;
|
||||||
|
layers.reserve(m_layer_sim.size());
|
||||||
|
for (int i = 0; i < int(m_layer_sim.size()); ++i) {
|
||||||
|
if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
|
||||||
|
}
|
||||||
|
if (layers.empty()) return;
|
||||||
|
std::sort(layers.begin(), layers.end());
|
||||||
|
printf("======================== sorted layer importances\n");
|
||||||
|
int j = 0;
|
||||||
|
for (auto& p : layers) {
|
||||||
|
int i = p.second;
|
||||||
|
printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
GGML_UNUSED(user_data);
|
GGML_UNUSED(user_data);
|
||||||
|
|
||||||
@@ -182,6 +221,45 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (m_collect_lsim) {
|
||||||
|
// We only need to do it here and not in the MoE branch above because the first tensor in a layer
|
||||||
|
// never is a MoE tensor
|
||||||
|
if (auto index = layer_index(src0->name); index.has_value()) {
|
||||||
|
if (*index != m_last_layer) {
|
||||||
|
if (*index > 0) {
|
||||||
|
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
|
||||||
|
printf("Oops: different size (%d vs %d)\n", (int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
|
||||||
|
auto& p = m_layer_sim[*index - 1];
|
||||||
|
auto x = m_last_input.data();
|
||||||
|
auto y = (const float *)data;
|
||||||
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
|
double sumxy = 0, sumx2 = 0, sumy2 = 0;
|
||||||
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
|
sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
|
||||||
|
}
|
||||||
|
double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/sqrt(sumx2*sumy2) : 0;
|
||||||
|
p.first += cos_sim;
|
||||||
|
p.second += 1;
|
||||||
|
x += src1->ne[0];
|
||||||
|
y += src1->ne[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m_last_layer = *index;
|
||||||
|
if (m_last_input.empty()) {
|
||||||
|
m_last_input.resize(src1->ne[0]*src1->ne[1]);
|
||||||
|
} else {
|
||||||
|
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
|
||||||
|
printf("Oops\n"); exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//printf("Copying src1 to m_last_input\n");
|
||||||
|
std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
auto & e = m_stats[wname];
|
auto & e = m_stats[wname];
|
||||||
if (e.values.empty()) {
|
if (e.values.empty()) {
|
||||||
e.values.resize(src1->ne[0], 0);
|
e.values.resize(src1->ne[0], 0);
|
||||||
@@ -622,7 +700,25 @@ int main(int argc, char ** argv) {
|
|||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.verbosity = 1;
|
params.verbosity = 1;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
bool lsim = false;
|
||||||
|
//
|
||||||
|
// Do not pollute common with totally imatrix specific arguments as it was done in mainline.
|
||||||
|
// Instead, parse imatrix specific args here, push unknown args into a new array of args,
|
||||||
|
// and pass that to gpt_params_parse().
|
||||||
|
//
|
||||||
|
std::vector<char*> args;
|
||||||
|
args.reserve(argc);
|
||||||
|
args.push_back(argv[0]);
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
std::string arg{argv[i]};
|
||||||
|
if (arg == "-lsim" || arg == "--layer-similarity") {
|
||||||
|
lsim = true;
|
||||||
|
} else {
|
||||||
|
args.push_back(argv[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||||
print_usage(argc, argv, params);
|
print_usage(argc, argv, params);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -630,6 +726,7 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
g_collector.set_params(params);
|
g_collector.set_params(params);
|
||||||
|
g_collector.set_collect_lsim(lsim);
|
||||||
|
|
||||||
for (const auto & in_file : params.in_files) {
|
for (const auto & in_file : params.in_files) {
|
||||||
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||||
@@ -680,6 +777,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
|
g_collector.print_layer_importance();
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user