iq1_bn(Metal): 89.0 -> 94.7 t/s for TG-128

So, total improvement is ~15%. Not bad.
2026-02-25 15:44:10 +00:00 · 2024-10-26 10:15:43 +02:00
parent a5c3e8839c
commit 885a48b788
1 changed files with 4 additions and 6 deletions
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -5459,16 +5459,14 @@ void kernel_mul_mv_iq1_bn_f32_impl(

    device const float * y4 = (device const float *)y + 32 * ix + 16 * ir;

-    const float values[3] = {-1.f, 0.f, 1.f};
-
-    constexpr uint8_t k_mult[5] = {81, 27, 9, 3, 1};
+    constexpr uint16_t k_mult[5] = {81, 27, 9, 3, 1};

    const int ib  = ix % (QK_IQ1BN / 32);
    const int i16 = 2*ib + ir;

+    float sumy = 0;
    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {

-        float sumy = 0;
        for (int j = 0; j < 16; ++j) { yl[j] = y4[j]; sumy += y4[j]; }

        const int ibl = ib32 / (QK_IQ1BN / 32);
@@ -5494,7 +5492,7 @@ void kernel_mul_mv_iq1_bn_f32_impl(
            v += v << 1;
            acc += yl[15] * (v & 0xff00);

-            sumf[row] += 0.00390625f * acc - sumy;
+            sumf[row] += acc;

            extra += row_size;
            ql    += row_size;
@@ -5504,7 +5502,7 @@ void kernel_mul_mv_iq1_bn_f32_impl(
    }

    for (int row = 0; row < N_DST; row += 2) {
-        float2 r = {sumf[row], sumf[row+1]};
+        float2 r = {0.00390625f * sumf[row] - sumy, 0.00390625 * sumf[row+1] - sumy};
        r = simd_sum(r);
        if (tiisg < 2) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row + tiisg] = r[tiisg] * scale[row + tiisg];