diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 3e25263f..f09d402f 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -67,26 +67,23 @@ std::pair<std::vector<float>, std::vector<float>> split_tensor(const ggml_tensor
     int n_per_row = layer->ne[0];
     int nrows = nelements/n_per_row;
-    std::vector<std::pair<float, int>> sumv(n_per_row);
-    for (int j = 0; j < n_per_row; ++j) sumv[j] = {0.f, j};
-
-    for (int row = 0; row < nrows; ++row) {
-        auto x = input_scratch_ptr + row*n_per_row;
-        for (int j = 0; j < n_per_row; ++j) sumv[j].first += x[j]*x[j];
+    const float * imatrix_data = nullptr;
+    if (auto it = imatrix.find(layer->name); it != imatrix.end() && int(it->second.size()) == n_per_row) {
+        imatrix_data = it->second.data();
     }
-    auto it = imatrix.find(layer->name);
-    bool have_imatrix = false;
-    if (it != imatrix.end() && int(it->second.size()) == n_per_row) {
-        have_imatrix = true;
-        for (int j = 0; j < n_per_row; ++j) sumv[j].first *= it->second[j];
+    std::vector<uint16_t> order(n_per_row);
+    if (!iqk_reorder(layer, imatrix_data, order.data())) {
+        return {};
     }
-    std::sort(sumv.begin(), sumv.end(), std::greater<std::pair<float, int>>{});
     int nblock = n_per_row/256;
     int nblock_high = int(nblock*0.1f + 0.5f);
     if (nblock_high == 0) return {};
+    std::sort(order.data(), order.data() + 256*nblock_high);
+    std::sort(order.data() + 256*nblock_high, order.data() + 256*nblock);
+
     std::vector<float> part1(256*nblock_high*nrows);
     std::vector<float> part2(256*(nblock-nblock_high)*nrows);
@@ -94,8 +91,8 @@ std::pair<std::vector<float>, std::vector<float>> split_tensor(const ggml_tensor
         auto x = input_scratch_ptr + row*n_per_row;
         auto yh = part1.data() + 256*nblock_high*row;
         auto yl = part2.data() + 256*(nblock-nblock_high)*row;
-        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = x[sumv[j].second];
-        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j-256*nblock_high] = x[sumv[j].second];
+        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = x[order[j]];
+        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j-256*nblock_high] = x[order[j]];
     }
 
     return std::make_pair(std::move(part1), std::move(part2));
@@ -721,9 +718,13 @@ int main(int argc, char ** argv) {
            auto h_type = get_better_type(type);
            auto h_qfns = ggml_internal_get_type_traits(h_type);
            if (!h_qfns.from_float || !h_qfns.to_float) continue;
+           std::string name1{kv_tensor.second->name}, name2(name1);
+           name1 += "_part1";
+           name2 += "_part2";
            ggml_tensor part1, part2;
-           snprintf(part1.name, 64, "%s_part1", kv_tensor.second->name);
-           snprintf(part2.name, 64, "%s_part2", kv_tensor.second->name);
+           std::memcpy(part1.name, name1.data(), name1.size() < 64 ? name1.size() + 1 : 64);
+           std::memcpy(part2.name, name2.data(), name2.size() < 64 ? name2.size() + 1 : 64);
+           part1.name[63] = part2.name[63] = 0; // unlike snprintf(), memcpy() does not null-terminate a truncated name
            auto nrows = ggml_nrows(kv_tensor.second);
            part1.ne[0] = part_h.size()/nrows;
            part1.ne[1] = part_h.size()/part1.ne[0];
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 3ff6b4da..dc92488b 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <thread>
 
 namespace {
 
@@ -2166,3 +2167,79 @@ void iqk_quantize_row_q8_K(const float * x, void * vy, int64_t k) {
 #endif
 }
+
+namespace {
+inline void add_x2(int n, float * sumx2, const float * x) {
+    for (int j = 0; j < n; ++j) sumx2[j] += x[j]*x[j];
+}
+inline void add_x2(int n, float * sumx2, const ggml_half * x) {
+    for (int j = 0; j < n; ++j) {
+        float v = GGML_FP16_TO_FP32(x[j]);
+        sumx2[j] += v*v;
+    }
+}
+inline void add_x2(int n, float * sumx2, const ggml_bf16_t * x) {
+    // bf16 is the upper 16 bits of an f32, so shifting left by 16 reconstructs the float
+    const uint16_t * ux = (const uint16_t *)x;
+    typedef union { uint32_t u; float f; } helper_t;
+    helper_t h;
+    for (int j = 0; j < n; ++j) {
+        h.u = (uint32_t)ux[j] << 16;
+        sumx2[j] += h.f*h.f;
+    }
+}
+}
+
+bool iqk_reorder(const ggml_tensor * t, const float * imatrix, uint16_t * order) {
+    if (!ggml_is_contiguous(t) || (t->type != GGML_TYPE_F32 && t->type != GGML_TYPE_F16 && t->type != GGML_TYPE_BF16)) {
+        return false;
+    }
+    int n_per_row = t->ne[0];
+    int nrows = ggml_nrows(t);
+    int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
+    int chunk = 64;
+    std::vector<float> sumx2(n_per_row, 0);
+
+    auto compute = [&sumx2, n_per_row, nrows, chunk, max_thread, t] (int ith) {
+        const char * cx0 = (const char *)t->data;
+        for (int i = ith*chunk; i < n_per_row; i += max_thread*chunk) {
+            auto y = sumx2.data() + i;
+            auto cx = cx0 + i*t->nb[0];
+            int n = i + chunk <= n_per_row ? chunk : n_per_row - i;
+            if (t->type == GGML_TYPE_F32) {
+                const float * x = (const float *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+            else if (t->type == GGML_TYPE_F16) {
+                const ggml_half * x = (const ggml_half *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+            else {
+                const ggml_bf16_t * x = (const ggml_bf16_t *)cx;
+                for (int row = 0; row < nrows; ++row) {
+                    add_x2(n, y, x);
+                    x += t->ne[0];
+                }
+            }
+        }
+    };
+    std::vector<std::thread> workers(max_thread-1);
+    int ith = 0;
+    for (auto& w : workers) w = std::thread(compute, ith++);
+    compute(ith);
+    for (auto& w : workers) w.join();
+
+    if (imatrix) {
+        for (int j = 0; j < n_per_row; ++j) sumx2[j] *= imatrix[j];
+    }
+    std::vector<std::pair<float, int>> sorted(n_per_row);
+    for (int j = 0; j < n_per_row; ++j) sorted[j] = {sumx2[j], j};
+    std::sort(sorted.begin(), sorted.end(), std::greater<std::pair<float, int>>{});
+    for (int j = 0; j < n_per_row; ++j) order[j] = sorted[j].second;
+    return true;
+}
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index e5c16fc9..7f449939 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -57,6 +57,9 @@ void vec_dot_iq1_tn_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void
 
 void iqk_quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 
+struct ggml_tensor;
+bool iqk_reorder(const struct ggml_tensor * t, const float * imatrix, uint16_t * order);
+
 #ifdef __cplusplus
 }
 #endif
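
Note for reviewers: the patch ranks the columns of a row-major tensor by their (optionally imatrix-weighted) sum of squares, then routes the top ~10% of 256-wide blocks into a separate, higher-precision part. Below is a minimal single-threaded sketch of the ranking for a plain f32 buffer; reorder_columns() and its layout assumptions are illustrative stand-ins, not the patch's implementation.

// Minimal sketch of the column ranking performed by iqk_reorder()
// (illustrative only; assumes a contiguous row-major f32 matrix).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint16_t> reorder_columns(const float * x, int nrows, int n_per_row,
                                             const float * imatrix = nullptr) {
    // Accumulate the sum of squares of every column over all rows.
    std::vector<float> sumx2(n_per_row, 0.f);
    for (int row = 0; row < nrows; ++row) {
        const float * xr = x + row*n_per_row;
        for (int j = 0; j < n_per_row; ++j) sumx2[j] += xr[j]*xr[j];
    }
    // Optionally weight by the importance-matrix activations.
    if (imatrix) for (int j = 0; j < n_per_row; ++j) sumx2[j] *= imatrix[j];
    // Argsort: column indices in descending order of (weighted) energy.
    std::vector<uint16_t> order(n_per_row);
    for (int j = 0; j < n_per_row; ++j) order[j] = uint16_t(j);
    std::sort(order.begin(), order.end(),
              [&sumx2](uint16_t a, uint16_t b) { return sumx2[a] > sumx2[b]; });
    return order;
}

int main() {
    // 2 rows x 4 columns; column 2 clearly dominates, column 0 is weakest.
    const float x[] = { 1.f, -2.f, 8.f,  3.f,
                        0.f,  2.f, 9.f, -3.f };
    for (auto j : reorder_columns(x, 2, 4)) printf("%u ", j);   // prints: 2 3 1 0
    printf("\n");
}

In the patch itself the accumulation is chunked over columns and spread across threads, with the f16/bf16 overloads of add_x2() converting on the fly.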
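And a similarly simplified rendition of what split_tensor() then does with the ranking; split_by_importance() is again an illustrative stand-in, not the patch's API. Sorting each segment of order restores the original column order inside each part, so only the partition into "important" and "other" columns differs, not the relative order within a part.

// Simplified version of the split in split_tensor(): columns belonging to the
// most important ~10% of 256-wide blocks go to part1, the rest to part2.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static std::pair<std::vector<float>, std::vector<float>>
split_by_importance(const float * x, int nrows, int n_per_row, std::vector<uint16_t> order) {
    int nblock      = n_per_row/256;
    int nblock_high = int(nblock*0.1f + 0.5f);      // ~10% of the blocks, rounded
    if (nblock_high == 0) return {};
    // Restore ascending column order within each segment of the ranking.
    std::sort(order.begin(), order.begin() + 256*nblock_high);
    std::sort(order.begin() + 256*nblock_high, order.begin() + 256*nblock);
    std::vector<float> part1(size_t(256)*nblock_high*nrows);
    std::vector<float> part2(size_t(256)*(nblock - nblock_high)*nrows);
    for (int row = 0; row < nrows; ++row) {
        const float * xr = x + size_t(row)*n_per_row;
        float * yh = part1.data() + size_t(256)*nblock_high*row;
        float * yl = part2.data() + size_t(256)*(nblock - nblock_high)*row;
        for (int j = 0; j < 256*nblock_high; ++j) yh[j] = xr[order[j]];
        for (int j = 256*nblock_high; j < 256*nblock; ++j) yl[j - 256*nblock_high] = xr[order[j]];
    }
    return {std::move(part1), std::move(part2)};
}

int main() {
    const int nrows = 4, n_per_row = 1280;          // 5 blocks of 256 -> 1 "high" block
    std::vector<float> x(size_t(nrows)*n_per_row);
    for (size_t i = 0; i < x.size(); ++i) x[i] = float(i % 97)*0.01f;
    // Pretend the ranking put the last block's 256 columns first.
    std::vector<uint16_t> order(n_per_row);
    for (int j = 0; j < n_per_row; ++j) order[j] = uint16_t((j + 1024) % n_per_row);
    auto [hi, lo] = split_by_importance(x.data(), nrows, n_per_row, std::move(order));
    printf("part1: %zu values, part2: %zu values\n", hi.size(), lo.size());   // 1024 / 4096
}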