Apply platform-specific modifications when repacking

On Zen4 we can pre-convert the signed quants in q8_0_r4 and
q8_k_r8 to unsigned at repack time, thus avoiding the signed-to-unsigned
conversion inside the matrix multiplication kernels. With this change we hit
PP-512 = 382.40 t/s (q8_k_r8)
PP-512 = 306.92 t/s (q8_0_r4)
for L3-8B on a Ryzen-7950X using q8_0 KV-cache.
This commit is contained in:
Iwan Kawrakow
2025-01-27 11:59:30 +02:00
parent 8b3c66063f
commit f1c114d477
2 changed files with 75 additions and 17 deletions

View File

@@ -2988,7 +2988,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
int nb = n / QK8_0;
GGML_ASSERT(nb%4 == 0);
if constexpr (nrc_y == 1) {
auto m127 = _mm256_set1_epi8(127);
__m256 acc[2] = {};
__m256i qx[8];
float d8[8];
@@ -2998,14 +2997,14 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
_mm256_storeu_ps(d8, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)));
for (int k = 0; k < 4; ++k) {
auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq8[4*ib4+k].d));
qx[0] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0), m127);
qx[1] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1), m127);
qx[2] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2), m127);
qx[3] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3), m127);
qx[4] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4), m127);
qx[5] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5), m127);
qx[6] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6), m127);
qx[7] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7), m127);
qx[0] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0);
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1);
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2);
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3);
qx[4] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4);
qx[5] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5);
qx[6] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6);
qx[7] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7);
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+0);
auto y4h = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+1);
auto yl = MM256_SET_M128I(y4l, y4l);
@@ -3031,7 +3030,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
__m512 acc[2*nrc_y] = {};
__m512i qx[8];
float d8[8*nrc_y];
auto m127 = _mm512_set1_epi8(127);
for (int ix = 0; ix < nrc_x; ix += 16) {
const block_q8_0_r8 * q8l = (const block_q8_0_r8 *)((const char *)vx + (ix+0)*bx);
const block_q8_0_r8 * q8h = (const block_q8_0_r8 *)((const char *)vx + (ix+8)*bx);
@@ -3046,7 +3044,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
for (int j = 0; j < 8; ++j) {
qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[4*ib4+k].qs+j)),
_mm256_loadu_si256((const __m256i *)q8h[4*ib4+k].qs+j), 1);
qx[j] = _mm512_add_epi8(qx[j], m127);
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[iy][ib4].qs+2*k+0);
@@ -5070,12 +5067,7 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1);
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2);
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3);
#ifdef HAVE_FANCY_SIMD
qx[0] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+0), _mm256_set1_epi8(-128));
qx[1] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1), _mm256_set1_epi8(-128));
qx[2] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2), _mm256_set1_epi8(-128));
qx[3] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3), _mm256_set1_epi8(-128));
#else
#ifndef HAVE_FANCY_SIMD
auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
@@ -13037,6 +13029,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
#ifdef HAVE_FANCY_SIMD
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 0, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 1, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 2, m2);
@@ -13053,6 +13051,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
#ifdef HAVE_FANCY_SIMD
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 4, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 5, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 6, m2);

View File

@@ -43,6 +43,15 @@ constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
#endif
#if defined __x86_64__
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
#endif
namespace {
inline int nearest_int(float fval) {
@@ -3746,12 +3755,33 @@ static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8
y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
}
}
#ifdef HAVE_FANCY_SIMD
if (online) {
for (int l = 0; l < 4; ++l) {
auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), _mm512_set1_epi8(127));
_mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
}
}
#endif
}
x += 8*nblock;
y += nblock;
}
}
#ifdef HAVE_FANCY_SIMD
// Post-repack fixup applied when q8_0_r4 data will be consumed by the
// AVX512 (HAVE_FANCY_SIMD) matmul kernels: adds 127 to every quant so the
// signed int8 values become unsigned, letting the kernels skip that step.
// k:  total number of weights covered by cy; each repacked block holds
//     8 interleaved rows of 32 quants, hence k/(32*8) blocks.
// cy: the repacked tensor data, modified in place.
static void modify_q8_0_r4(int64_t k, char * cy) {
    // NOTE(review): the cast uses block_iq4_nl_r8 although this buffer holds
    // q8_0_r4 data (the matmul path casts the same data to block_q8_0_r8);
    // presumably the two structs have identical .qs layout — confirm.
    auto y = (block_iq4_nl_r8 *)cy;
    int nb = k/(32*8);
    for (int ib = 0; ib < nb; ++ib) {
        // 4 x 64-byte vectors = the 256 quants of one repacked block
        for (int l = 0; l < 4; ++l) {
            auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), _mm512_set1_epi8(127));
            _mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
        }
    }
}
#endif
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
GGML_ASSERT(nrows%8 == 0);
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
@@ -5195,11 +5225,31 @@ static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8
for (int i = 0; i < 4; ++i) y[ibl].qs[32*ib + 4*k + i] = x8[k][ibl].qs[4*ib+i];
}
}
#ifdef HAVE_FANCY_SIMD
if (online) {
for (int l = 0; l < 32; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ibl].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[ibl].qs + l, v);
}
}
#endif
}
x += 8*nblock;
y += nblock;
}
}
#ifdef HAVE_FANCY_SIMD
// Post-repack fixup applied when q8_k_r8 data will be consumed by the
// AVX512 (HAVE_FANCY_SIMD) matmul kernels: flips the sign bit (XOR 0x80)
// of every quant so the signed int8 values become unsigned, letting the
// kernels skip that conversion.
// k:  total number of weights covered by cy; each repacked block holds
//     8 interleaved rows of 256 quants, hence k/(256*8) blocks.
// cy: the repacked tensor data, modified in place.
static void modify_q8_k_r8(int64_t k, char * cy) {
    auto blocks = (block_q8_k_r8 *)cy;
    const auto sign_bit = _mm512_set1_epi8(-128);
    int nblock = k/(256*8);
    for (int ibl = 0; ibl < nblock; ++ibl) {
        // 32 x 64-byte vectors = the 2048 quants of one repacked block
        auto q = (__m512i *)blocks[ibl].qs;
        for (int j = 0; j < 32; ++j) {
            _mm512_storeu_si512(q + j, _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)q + j), sign_bit));
        }
    }
}
#endif
size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
GGML_ASSERT(nrows%8 == 0);
@@ -6057,6 +6107,10 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) {
static const std::unordered_map<ggml_type, Modify> k_mod_map = {
#ifdef __ARM_NEON
{ GGML_TYPE_Q4_0_R4, {modify_q4_0_r4, 8} },
#endif
#ifdef HAVE_FANCY_SIMD
{ GGML_TYPE_Q8_0_R4, {modify_q8_0_r4, 8} },
{ GGML_TYPE_Q8_K_R8, {modify_q8_k_r8, 8} },
#endif
};
auto it = k_mod_map.find(tensor->type);