diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 3e25263f..f09d402f 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -67,26 +67,23 @@ std::pair<std::vector<float>, std::vector<float>> split_tensor(const ggml_tensor
     int n_per_row = layer->ne[0];
     int nrows = nelements/n_per_row;
-    std::vector<std::pair<float, int>> sumv(n_per_row);
-    for (int j = 0; j < n_per_row; ++j) sumv[j] = {0.f, j};
-
-    for (int row = 0; row < nrows; ++row) {
-        auto x = input_scratch_ptr + row*n_per_row;
-        for (int j = 0; j < n_per_row; ++j) sumv[j].first += x[j]*x[j];
+    const float * imatrix_data = nullptr;
+    if (auto it = imatrix.find(layer->name); it != imatrix.end() && int(it->second.size()) == n_per_row) {
+        imatrix_data = it->second.data();
     }
-    auto it = imatrix.find(layer->name);
-    bool have_imatrix = false;
-    if (it != imatrix.end() && int(it->second.size()) == n_per_row) {
-        have_imatrix = true;
-        for (int j = 0; j < n_per_row; ++j) sumv[j].first *= it->second[j];
+    std::vector<uint16_t> order(n_per_row);
+    if (!iqk_reorder(layer, imatrix_data, order.data())) {
+        return {};
     }
-    std::sort(sumv.begin(), sumv.end(), std::greater<std::pair<float, int>>{});
     int nblock = n_per_row/256;
     int nblock_high = int(nblock*0.1f + 0.5f);
     if (nblock_high == 0) return {};
+    std::sort(order.data(), order.data() + 256*nblock_high);
+    std::sort(order.data() + 256*nblock_high, order.data() + 256*nblock);
+
     std::vector<float> part1(256*nblock_high*nrows);
     std::vector<float> part2(256*(nblock-nblock_high)*nrows);
@@ -94,8 +91,8 @@ std::pair<std::vector<float>, std::vector<float>> split_tensor(const ggml_tensor
         auto x = input_scratch_ptr + row*n_per_row;
         auto yh = part1.data() + 256*nblock_high*row;
         auto yl = part2.data() + 256*(nblock-nblock_high)*row;
-        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = x[sumv[j].second];
-        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j-256*nblock_high] = x[sumv[j].second];
+        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = x[order[j]];
+        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j-256*nblock_high] = x[order[j]];
     }
 
     return std::make_pair(std::move(part1), std::move(part2));
@@ -721,9 +718,13 @@ int main(int argc, char ** argv) {
            auto h_type = get_better_type(type);
            auto h_qfns = ggml_internal_get_type_traits(h_type);
            if (!h_qfns.from_float || !h_qfns.to_float) continue;
+           std::string name1{kv_tensor.second->name}, name2(name1);
+           name1 += "_part1";
+           name2 += "_part2";
            ggml_tensor part1, part2;
-           snprintf(part1.name, 64, "%s_part1", kv_tensor.second->name);
-           snprintf(part2.name, 64, "%s_part2", kv_tensor.second->name);
+           std::memcpy(part1.name, name1.data(), name1.size() < 64 ? name1.size() + 1 : 64);
+           std::memcpy(part2.name, name2.data(), name2.size() < 64 ? name2.size() + 1 : 64);
+           part1.name[63] = part2.name[63] = 0; // unlike snprintf(), memcpy() does not null-terminate a truncated name
            auto nrows = ggml_nrows(kv_tensor.second);
            part1.ne[0] = part_h.size()/nrows;
            part1.ne[1] = part_h.size()/part1.ne[0];
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 3ff6b4da..dc92488b 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <thread>
 
 namespace {
 
@@ -2166,3 +2167,79 @@ void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) {
 #endif
 }
+
+namespace {
+inline void add_x2(int n, float * sumx2, const float * x) {
+    for (int j = 0; j < n; ++j) sumx2[j] += x[j]*x[j];
+}
+inline void add_x2(int n, float * sumx2, const ggml_half * x) {
+    for (int j = 0; j < n; ++j) {
+        float v = GGML_FP16_TO_FP32(x[j]);
+        sumx2[j] += v*v;
+    }
+}
+inline void add_x2(int n, float * sumx2, const ggml_bf16_t * x) {
+    // bf16 is the upper 16 bits of an f32, so shifting left by 16 reconstructs the float
+    const uint16_t * ux = (const uint16_t *)x;
+    typedef union { uint32_t u; float f; } helper_t;
+    helper_t h;
+    for (int j = 0; j < n; ++j) {
+        h.u = (uint32_t)ux[j] << 16;
+        sumx2[j] += h.f*h.f;
+    }
+}
+}
+
+bool iqk_reorder(const ggml_tensor * t, const float * imatrix, uint16_t * order) {
+    if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) {
+        return false;
+    }
+    int n_per_row = t->ne[0];
+    int nrows = ggml_nrows(t);
+    int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
+    int chunk = 64;
+    std::vector<float> sumx2(n_per_row, 0);
+
+    auto compute = [&sumx2, n_per_row, nrows, chunk, max_thread, t] (int ith) {
+        const char * cx0 = (const char *)t->data;
+        for (int i = ith*chunk; i < n_per_row; i += max_thread*chunk) {
+            auto y = sumx2.data() + i;
+            auto cx = cx0 + i*t->nb[0];
+            int n = i + chunk <= n_per_row ? chunk : n_per_row - i;
+            if (t->type == GGML_TYPE_F32) {
+                const float * x = (const float *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+            else if (t->type == GGML_TYPE_F16) {
+                const ggml_half * x = (const ggml_half *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+            else {
+                const ggml_bf16_t * x = (const ggml_bf16_t *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+        }
+    };
+    std::vector<std::thread> workers(max_thread-1);
+    int ith = 0;
+    for (auto& w : workers) w = std::thread(compute, ith++);
+    compute(ith);
+    for (auto& w : workers) w.join();
+
+    if (imatrix) {
+        for (int j = 0; j < n_per_row; ++j) sumx2[j] *= imatrix[j];
+    }
+    std::vector<std::pair<float, int>> sorted(n_per_row);
+    for (int j = 0; j < n_per_row; ++j) sorted[j] = {sumx2[j], j};
+    std::sort(sorted.begin(), sorted.end(), std::greater<std::pair<float, int>>{});
+    for (int j = 0; j < n_per_row; ++j) order[j] = sorted[j].second;
+    return true;
+}
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index e5c16fc9..7f449939 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -57,6 +57,9 @@ void vec_dot_iq1_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void
 
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 
+struct ggml_tensor;
+bool iqk_reorder(const struct ggml_tensor * t, const float * imatrix, uint16_t * order);
+
 #ifdef __cplusplus
 }
 #endif
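
Note for reviewers: the patch ranks the columns of a row-major tensor by their (optionally imatrix-weighted) sum of squares, then routes the top ~10% of 256-wide blocks into a separate, higher-precision part. Below is a minimal single-threaded sketch of the ranking for a plain f32 buffer; reorder_columns() and its layout assumptions are illustrative stand-ins, not the patch's implementation.

// Minimal sketch of the column ranking performed by iqk_reorder()
// (illustrative only; assumes a contiguous row-major f32 matrix).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint16_t> reorder_columns(const float * x, int nrows, int n_per_row,
                                             const float * imatrix = nullptr) {
    // Accumulate the sum of squares of every column over all rows.
    std::vector<float> sumx2(n_per_row, 0.f);
    for (int row = 0; row < nrows; ++row) {
        const float * xr = x + row*n_per_row;
        for (int j = 0; j < n_per_row; ++j) sumx2[j] += xr[j]*xr[j];
    }
    // Optionally weight by the importance-matrix activations.
    if (imatrix) for (int j = 0; j < n_per_row; ++j) sumx2[j] *= imatrix[j];
    // Argsort: column indices in descending order of (weighted) energy.
    std::vector<uint16_t> order(n_per_row);
    for (int j = 0; j < n_per_row; ++j) order[j] = uint16_t(j);
    std::sort(order.begin(), order.end(),
              [&sumx2](uint16_t a, uint16_t b) { return sumx2[a] > sumx2[b]; });
    return order;
}

int main() {
    // 2 rows x 4 columns; column 2 clearly dominates, column 0 is weakest.
    const float x[] = { 1.f, -2.f, 8.f,  3.f,
                        0.f,  2.f, 9.f, -3.f };
    for (auto j : reorder_columns(x, 2, 4)) printf("%u ", j);   // prints: 2 3 1 0
    printf("\n");
}

In the patch itself the accumulation is chunked over columns and spread across threads, with the f16/bf16 overloads of add_x2() converting on the fly.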
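And a similarly simplified rendition of what split_tensor() then does with the ranking; split_by_importance() is again an illustrative stand-in, not the patch's API. Sorting each segment of order restores the original column order inside each part, so only the partition into "important" and "other" columns differs, not the relative order within a part.

// Simplified version of the split in split_tensor(): columns belonging to the
// most important ~10% of 256-wide blocks go to part1, the rest to part2.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static std::pair<std::vector<float>, std::vector<float>>
split_by_importance(const float * x, int nrows, int n_per_row, std::vector<uint16_t> order) {
    int nblock      = n_per_row/256;
    int nblock_high = int(nblock*0.1f + 0.5f);      // ~10% of the blocks, rounded
    if (nblock_high == 0) return {};
    // Restore ascending column order within each segment of the ranking.
    std::sort(order.begin(), order.begin() + 256*nblock_high);
    std::sort(order.begin() + 256*nblock_high, order.begin() + 256*nblock);
    std::vector<float> part1(size_t(256)*nblock_high*nrows);
    std::vector<float> part2(size_t(256)*(nblock - nblock_high)*nrows);
    for (int row = 0; row < nrows; ++row) {
        const float * xr = x + size_t(row)*n_per_row;
        float * yh = part1.data() + size_t(256)*nblock_high*row;
        float * yl = part2.data() + size_t(256)*(nblock - nblock_high)*row;
        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = xr[order[j]];
        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j - 256*nblock_high] = xr[order[j]];
    }
    return {std::move(part1), std::move(part2)};
}

int main() {
    const int nrows = 4, n_per_row = 1280;          // 5 blocks of 256 -> 1 "high" block
    std::vector<float> x(size_t(nrows)*n_per_row);
    for (size_t i = 0; i < x.size(); ++i) x[i] = float(i % 97)*0.01f;
    // Pretend the ranking put the last block's 256 columns first.
    std::vector<uint16_t> order(n_per_row);
    for (int j = 0; j < n_per_row; ++j) order[j] = uint16_t((j + 1024) % n_per_row);
    auto [hi, lo] = split_by_importance(x.data(), nrows, n_per_row, std::move(order));
    printf("part1: %zu values, part2: %zu values\n", hi.size(), lo.size());   // 1024 / 4096
}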