quantize-stats on transposed tensors

I always wanted to know whether transposing the model tensors might
improve quantization. If, for whatever reason, weights in different rows
were correlated at the same position within a row, a transposed version
of the tensor would quantize better. This commit tries it and, nope, no
luck.
Author: Iwan Kawrakow
Date:   2024-10-05 17:57:52 +03:00
parent c0ddc644bb
commit 403d4eef35
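
To make the hypothesis concrete: ggml stores a 2D tensor row-major, with
ne[0] values per row, and the block quantizers consume values in memory
order, so each quantization block covers a contiguous stretch of a single
row. After a transpose, a block instead covers the same within-row position
across consecutive rows. Below is a minimal sketch of the remap being
tested, with hypothetical names and plain-float input (the patch itself
dispatches on F16 vs F32):

    #include <cstdint>
    #include <vector>

    // Repack a row-major nrows x ncols matrix so that values from the same
    // column become contiguous; quantization blocks taken from the result
    // then run down columns instead of along rows.
    static std::vector<float> transpose_for_quant(const float * src, int64_t nrows, int64_t ncols) {
        std::vector<float> dst(nrows*ncols);
        for (int64_t i = 0; i < ncols; ++i) {     // i: position within a row
            for (int64_t j = 0; j < nrows; ++j) { // j: row index
                dst[i*nrows + j] = src[j*ncols + i];
            }
        }
        return dst;
    }

K-quant blocks are 256 values wide, which is presumably why the patch
rejects tensors with fewer than 256 rows: after the remap, a block should
fall entirely within one column.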

@@ -28,6 +28,7 @@ struct quantize_stats_params {
     bool per_layer_stats = false;
     bool print_histogram = false;
     bool reference = false;
+    bool transpose = false;
     std::vector<std::string> include_layers;
     std::vector<std::string> exclude_layers;
     std::vector<enum ggml_type> include_types;
@@ -146,16 +147,17 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static void test_roundtrip_on_chunk(
+static void test_roundtrip_on_chunk(bool fill_data,
     const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
-    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
-) {
-    if (layer->type == GGML_TYPE_F16) {
-        for (int i = 0; i < chunk_size; i++) {
-            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats) {
+    if (fill_data) {
+        if (layer->type == GGML_TYPE_F16) {
+            for (int i = 0; i < chunk_size; i++) {
+                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+            }
+        } else {
+            input_scratch = ggml_get_data_f32(layer) + offset;
         }
-    } else {
-        input_scratch = ggml_get_data_f32(layer) + offset;
     }
 
     if (use_reference) {
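
(The new fill_data parameter decides whether the chunk worker gathers its
input from the layer itself. The call sites below pass !transpose: when the
caller has already written a transposed copy into input_scratch, the
pre-filled buffer is used as-is.)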
@@ -170,20 +172,45 @@ static void test_roundtrip_on_chunk(
 
 // Run quantization function for a single layer and update error stats
-static void test_roundtrip_on_layer(
+static void test_roundtrip_on_layer(bool transpose,
     std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
-    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
-) {
+    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0) {
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
     uint64_t nelements = ggml_nelements(layer);
 
     float* input_scratch_ptr = nullptr;
-    if (layer->type == GGML_TYPE_F16) {
+    if (transpose) {
+        if (layer->ne[2] > 1 || layer->ne[3] > 1 || layer->ne[1] < 256 || !ggml_is_contiguous(layer)) {
+            printf("%s: transpose option requires contiguous 2D tensor with >= 256 rows\n", __func__);
+            return;
+        }
         if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        if (layer->type == GGML_TYPE_F16) {
+            const ggml_fp16_t * data = (const ggml_fp16_t *)layer->data;
+            for (int i = 0; i < layer->ne[0]; ++i) for (int j = 0; j < layer->ne[1]; ++j) {
+                input_scratch[i*layer->ne[1] + j] = ggml_fp16_to_fp32(data[j*layer->ne[0] + i]);
+            }
+        }
+        else if (layer->type == GGML_TYPE_F32) {
+            const float * data = (const float *)layer->data;
+            for (int i = 0; i < layer->ne[0]; ++i) for (int j = 0; j < layer->ne[1]; ++j) {
+                input_scratch[i*layer->ne[1] + j] = data[j*layer->ne[0] + i];
+            }
+        }
+        else {
+            printf("%s: unsupported type %s\n", __func__, ggml_type_name(layer->type));
+            return;
+        }
         input_scratch_ptr = input_scratch.data();
     }
+    else {
+        if (layer->type == GGML_TYPE_F16) {
+            if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+            input_scratch_ptr = input_scratch.data();
+        }
+    }
     if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
     if (output_scratch.size() < nelements) output_scratch.resize(nelements);
@@ -192,14 +219,14 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(!transpose, layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
                 output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
         auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
-                &quantized_scratch, &output_scratch, chunk_size] () {
+                &quantized_scratch, &output_scratch, chunk_size, transpose] () {
             error_stats local_stats {};
             while (true) {
                 std::unique_lock<std::mutex> lock(mutex);
@@ -210,7 +237,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(!transpose, layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
                         quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -249,6 +276,8 @@ int main(int argc, char ** argv) {
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
             params.per_layer_stats = true;
+        } else if (arg == "--transpose") {
+            params.transpose = true;
         } else if (arg == "--histogram") {
             params.print_histogram = true;
         } else if (arg == "-m" || arg == "--model") {
@@ -404,7 +433,7 @@ int main(int argc, char ** argv) {
         }
         std::string layer_name { ggml_type_name(type) };
         layer_name += "::" + kv_tensor.first;
-        test_roundtrip_on_layer(
+        test_roundtrip_on_layer(params.transpose,
                 layer_name,
                 params.per_layer_stats,
                 qfns,
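
With the flag wired into the argument parser, the experiment is a pair of
runs; the binary name and model path below are illustrative, the flags come
from this patch:

    # baseline: quantization blocks along rows, as stored
    ./llama-quantize-stats -m model.gguf --per-layer-stats

    # experiment: quantization blocks along columns
    ./llama-quantize-stats -m model.gguf --per-layer-stats --transpose

Per the commit message, the transposed runs showed no improvement.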