From c106b466b6f9ce67ae4f822064a6dbbc39e27ac3 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sat, 5 Oct 2024 19:19:10 +0300
Subject: [PATCH] WIP

---
 examples/quantize-stats/CMakeLists.txt     |   2 +-
 examples/quantize-stats/quantize-stats.cpp | 216 +++++++++++++++++++--
 2 files changed, 205 insertions(+), 13 deletions(-)

diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index bb986a71..58898345 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -2,5 +2,5 @@ set(TARGET llama-quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../../common ../../ggml/src)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 214dc528..97459763 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -2,6 +2,9 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "iqk/iqk_quantize.h"
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"

 #include <algorithm>
 #include <cassert>
@@ -17,6 +20,7 @@
 #include <vector>
 #include <thread>
 #include <mutex>
+#include <fstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -34,6 +38,112 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };

+using Imatrix = std::unordered_map<std::string, std::vector<float>>;
+
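+// Mixed-precision split experiment: rank the columns of a 2D tensor by their
+// sum of squared weights, scaled by the imatrix when one is available. The
+// top ~10% of 256-wide blocks (after reordering columns by rank) are
+// quantized with IQ3_K, the remaining ~90% with IQ2_K, and the
+// imatrix-weighted RMSE of the round trip is printed.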
+static void analyze_layer(const std::string & name, const ggml_tensor * layer, std::vector<float> & input_scratch,
+        const Imatrix & imatrix) {
+
+    if (layer->ne[0] == 1 || layer->ne[1] == 1) return;
+
+    if (!ggml_is_contiguous(layer)) {
+        printf("%s: %s is not contiguous\n", __func__, layer->name);
+        return;
+    }
+
+    int nelements = ggml_nelements(layer);
+
+    float * input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if ((int)input_scratch.size() < nelements) input_scratch.resize(nelements);
+        for (int i = 0; i < nelements; ++i) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i);
+        }
+        input_scratch_ptr = input_scratch.data();
+    } else {
+        input_scratch_ptr = ggml_get_data_f32(layer);
+    }
+
+    int n_per_row = layer->ne[0];
+    int nrows = nelements/n_per_row;
+
+    std::vector<std::pair<float, int>> sumv(n_per_row);
+    for (int j = 0; j < n_per_row; ++j) sumv[j] = {0.f, j};
+
+    for (int row = 0; row < nrows; ++row) {
+        auto x = input_scratch_ptr + row*n_per_row;
+        for (int j = 0; j < n_per_row; ++j) sumv[j].first += x[j]*x[j];
+    }
+
+    auto it = imatrix.find(layer->name);
+    bool have_imatrix = false;
+    if (it != imatrix.end() && int(it->second.size()) == n_per_row) {
+        have_imatrix = true;
+        for (int j = 0; j < n_per_row; ++j) sumv[j].first *= it->second[j];
+    }
+    std::sort(sumv.begin(), sumv.end(), std::greater<std::pair<float, int>>{});
+
+    printf("%s: %g %g %g\n", name.c_str(), sumv.front().first, sumv[n_per_row/2].first, sumv.back().first);
+
+    int nblock = n_per_row/256;
+    int nblock_high = int(nblock*0.1f + 0.5f);
+    if (nblock_high == 0) return;
+
+    std::vector<float> part1(256*nblock_high*nrows);
+    std::vector<float> part2(256*(nblock-nblock_high)*nrows);
+
+    for (int row = 0; row < nrows; ++row) {
+        auto x  = input_scratch_ptr + row*n_per_row;
+        auto yh = part1.data() + 256*nblock_high*row;
+        auto yl = part2.data() + 256*(nblock-nblock_high)*row;
+        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = x[sumv[j].second];
+        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j-256*nblock_high] = x[sumv[j].second];
+    }
+
+    std::vector<float> reordered_imatrix;
+    if (have_imatrix) {
+        reordered_imatrix.resize(256*nblock);
+        for (int j = 0; j < 256*nblock; ++j) reordered_imatrix[j] = it->second[sumv[j].second];
+    }
+
+    auto row_size_h = ggml_row_size(GGML_TYPE_IQ3_K, 256*nblock_high);
+    auto row_size_l = ggml_row_size(GGML_TYPE_IQ2_K, 256*(nblock-nblock_high));
+
+    std::vector<char> qdata_h(row_size_h*nrows);
+    std::vector<char> qdata_l(row_size_l*nrows);
+
+    ggml_quantize_chunk(GGML_TYPE_IQ3_K, part1.data(), (void *)qdata_h.data(), 0, nrows, 256*nblock_high, reordered_imatrix.data());
+    ggml_quantize_chunk(GGML_TYPE_IQ2_K, part2.data(), (void *)qdata_l.data(), 0, nrows, 256*(nblock-nblock_high),
+            have_imatrix ? reordered_imatrix.data() + 256*nblock_high : nullptr);
+
+    std::vector<float> deq_part1(256*nblock_high*nrows);
+    std::vector<float> deq_part2(256*(nblock-nblock_high)*nrows);
+
+    dequantize_row_iq3_k((const block_iq3_k *)qdata_h.data(), deq_part1.data(), deq_part1.size());
+    dequantize_row_iq2_k((const block_iq2_k *)qdata_l.data(), deq_part2.data(), deq_part2.size());
+
+    double mse = 0, sumw = 0;
+    for (int row = 0; row < nrows; ++row) {
+        auto xh = part1.data() + 256*nblock_high*row;
+        auto xl = part2.data() + 256*(nblock-nblock_high)*row;
+        auto yh = deq_part1.data() + 256*nblock_high*row;
+        auto yl = deq_part2.data() + 256*(nblock-nblock_high)*row;
+        for (int j = 0; j < 256*nblock_high; ++j) {
+            float w = have_imatrix ? reordered_imatrix[j] : 1;
+            float diff = xh[j] - yh[j];
+            mse  += w*diff*diff;
+            sumw += w;
+        }
+        for (int j = 0; j < 256*(nblock-nblock_high); ++j) {
+            float w = have_imatrix ? reordered_imatrix[j+256*nblock_high] : 1;
+            float diff = xl[j] - yl[j];
+            mse  += w*diff*diff;
+            sumw += w;
+        }
+    }
+    printf(" rmse = %g\n", sqrt(mse/sumw));
+
+}
+
 constexpr size_t HISTOGRAM_BUCKETS = 150;
 constexpr double HISTOGRAM_RANGE = 0.03;
@@ -254,6 +364,69 @@ static void test_roundtrip_on_layer(bool transpose,
     }
 }

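+// Read an imatrix file. The binary layout, as parsed below: an int32 entry
+// count; then per entry an int32 name length, the name bytes, an int32 call
+// count, an int32 value count, and that many floats (divided by the call
+// count to get per-call averages). Newer files append the number of
+// processed chunks and the dataset file name at the end.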
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, Imatrix & imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n", __func__, imatrix_file.c_str());
+        exit(1);
+    }
+    int n_entries;
+    in.read((char *)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        exit(1);
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            exit(1);
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto & e = imatrix_data[name];
+        int ncall;
+        in.read((char *)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+            imatrix_data = {};
+            exit(1);
+        }
+        e.resize(nval);
+        in.read((char *)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n", __func__, i);
+            imatrix_data = {};
+            exit(1);
+        }
+        if (ncall > 0) {
+            for (auto & v : e) v /= ncall;
+        }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
+    }
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
@@ -263,7 +436,9 @@
     int max_thread = 0;
     bool invalid_param = false;
+    bool analyze = false;
     std::string arg;
+    std::string imatrix_file;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -276,6 +451,8 @@
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
             params.per_layer_stats = true;
+        } else if (arg == "-a" || arg == "--analyze") {
+            analyze = true;
         } else if (arg == "--transpose") {
             params.transpose = true;
         } else if (arg == "--histogram") {
@@ -286,6 +463,12 @@
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-im" || arg == "--imatrix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            imatrix_file = argv[i];
         } else if (arg == "-l" || arg == "--include-layer") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -332,6 +515,12 @@
         return 1;
     }

+    Imatrix imatrix;
+    if (!imatrix_file.empty()) {
+        std::string dum;
+        load_imatrix(imatrix_file, dum, imatrix);
+    }
+
     print_build_info();

     // load the model
@@ -433,18 +622,21 @@
             }
             std::string layer_name { ggml_type_name(type) };
             layer_name += "::" + kv_tensor.first;
-            test_roundtrip_on_layer(params.transpose,
-                    layer_name,
-                    params.per_layer_stats,
-                    qfns,
-                    params.reference,
-                    kv_tensor.second,
-                    input_scratch,
-                    quantized_scratch,
-                    output_scratch,
-                    global_stats,
-                    max_thread
-            );
+            if (analyze) {
+                analyze_layer(layer_name, kv_tensor.second, input_scratch, imatrix);
+            } else {
+                test_roundtrip_on_layer(params.transpose,
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread);
+            }
         }
         print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
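
Usage sketch for the new mode: `-a/--analyze` and `-im/--imatrix` are the
flags added above; `-m` is assumed to be the usual quantize-stats model flag
(only its `params.model = argv[i]` line appears in the context shown), and
the file names are placeholders:

    ./llama-quantize-stats -m model.gguf -a -im imatrix.dat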