From 3dc4dffed5295418601053923486e343d86b9503 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 27 Aug 2025 08:03:47 +0300 Subject: [PATCH] Fix avx2 GEMM mess (v2) (#724) * This fixes confusion around Q8_0 on AVX2 * This does it for iq4_nl, including FA * This does it for iq4_nl on Zen4, but FA does not work * Slightly more clear * Adding forgotten q8_0_r8 to num_rows() --------- Co-authored-by: Iwan Kawrakow --- ggml/src/ggml-quants.c | 6 +- ggml/src/ggml.c | 7 +- ggml/src/iqk/fa/iqk_fa_templates.h | 5 - ggml/src/iqk/iqk_common.h | 10 ++ ggml/src/iqk/iqk_gemm_legacy_quants.cpp | 143 +++++++++++++++++------- ggml/src/iqk/iqk_mul_mat.cpp | 3 +- 6 files changed, 114 insertions(+), 60 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1cff076e..20a9831b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5562,11 +5562,7 @@ void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * r void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { #if GGML_USE_IQK_MULMAT -#ifdef HAVE_FANCY_SIMD - enum ggml_type dot_type = GGML_TYPE_Q8_1_X4; -#else - enum ggml_type dot_type = GGML_TYPE_Q8_0_X4; -#endif + enum ggml_type dot_type = GGML_TYPE_Q8_2_X4; if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, dot_type, vy, by, s, bs, 0, 1)) { return; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ed5c448b..d671d539 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -856,10 +856,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, #if GGML_USE_IQK_MULMAT -#ifdef HAVE_FANCY_SIMD - // Remember: we cannot add 128 to the Q8 quants and use iblock sum in Q8_1 to subtract as we do on Zen4 for pure AVX2 - // because there the result of the _mm256_maddubs_epi16() instruction may overflow the int16_t range - // (and it gets satured if it does), leading to wrong results. +#ifdef __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, @@ -1314,7 +1311,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_iq4_nl, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, .vec_dot = ggml_vec_dot_iq4_nl_q8_0, -#if defined HAVE_FANCY_SIMD +#if __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, diff --git a/ggml/src/iqk/fa/iqk_fa_templates.h b/ggml/src/iqk/fa/iqk_fa_templates.h index 1971c472..614a2936 100644 --- a/ggml/src/iqk/fa/iqk_fa_templates.h +++ b/ggml/src/iqk/fa/iqk_fa_templates.h @@ -615,13 +615,8 @@ struct HelperIQ4nl final : public BaseHelper { constexpr static int block_size_q = QK8_0; #else HelperIQ4nl(const char * data, int stride) : Base(data, stride) {} -#ifdef HAVE_FANCY_SIMD using block_q8 = block_q8_2; constexpr static int block_size_q = QK8_2; -#else - using block_q8 = block_q8_0; - constexpr static int block_size_q = QK8_0; -#endif #endif // Needed for v * softmax(k * q) diff --git a/ggml/src/iqk/iqk_common.h b/ggml/src/iqk/iqk_common.h index c38dc863..474db15e 100644 --- a/ggml/src/iqk/iqk_common.h +++ b/ggml/src/iqk/iqk_common.h @@ -148,6 +148,16 @@ typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& inf funcs[6] = kernel;\ funcs[7] = kernel;\ +#define IQK_SET_MUL_MAT_FUNCTIONS_T2(kernel, Dequantizer, Block, funcs) \ + funcs[0] = kernel;\ + funcs[1] = kernel;\ + funcs[2] = kernel;\ + funcs[3] = kernel;\ + funcs[4] = kernel;\ + funcs[5] = kernel;\ + funcs[6] = kernel;\ + funcs[7] = kernel;\ + #define IQK_SET_MUL_MAT_FUNCTIONS(kernel, funcs) \ funcs[0] = kernel<1>;\ funcs[1] = kernel<2>;\ diff --git a/ggml/src/iqk/iqk_gemm_legacy_quants.cpp b/ggml/src/iqk/iqk_gemm_legacy_quants.cpp index 03128319..28fc7e67 100644 --- a/ggml/src/iqk/iqk_gemm_legacy_quants.cpp +++ b/ggml/src/iqk/iqk_gemm_legacy_quants.cpp @@ -79,6 +79,16 @@ template struct Sum4q4 { inline __m256i compute(__m256i x, __m256i y) const { return _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(x, y)); } }; +inline __m256 convert_scales(const uint16_t * scales) { + auto aux_d = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales)), 16)); + auto aux_m = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(scales+4)))); + return _mm256_set_m128(_mm_mul_ps(aux_d, aux_m), aux_d); +} + +inline __m128 convert_scales_s(const uint16_t * scales) { + return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales)), 16)); +} + struct ScaleHelperQ8_0 { inline __m128 prepare4(const block_q8_0 * y) { const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y; @@ -106,6 +116,20 @@ struct ScaleHelperQ_0 { template inline float prepare1(float d, const Q * y) const { return d*prepare1(y); } }; +struct ScaleHelperQ8_2S { + template + inline __m128 prepare4(const Q * y) { + const block_q8_2_x4 * y4 = (const block_q8_2_x4 *)y; + return convert_scales_s((const uint16_t *)y4->d); + } + template + inline __m128 prepare4(__m128 other_scales, const Q * y) { + return _mm_mul_ps(other_scales, prepare4(y)); + } + template static inline float prepare1(const Q * y) { return GGML_BF16_TO_FP32(ggml_bf16_t{y->d}); } + template static inline float prepare1(float d, const Q * y) { return d*prepare1(y); } +}; + struct ScaleHelperQ_0_MXFP4 { float scales[4]; template @@ -188,12 +212,6 @@ struct ScaleHelperQ8_1 { } }; -inline __m256 convert_scales(const uint16_t * scales) { - auto aux_d = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales)), 16)); - auto aux_m = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(scales+4)))); - return _mm256_set_m128(_mm_mul_ps(aux_d, aux_m), aux_d); -} - struct ScaleHelperQ8_2 { template inline __m256 prepare4(const Q * y) { @@ -348,6 +366,7 @@ using AccumType1 = AccumT, nrc_y, is_multiple_of_4>; using Sum4TypeQ80 = Sum4; using Sum4TypeQ82 = Sum4; +using Sum4TypeQ82S = Sum4; template void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) { @@ -374,10 +393,35 @@ void mul_mat_qX_q8_Helper_x2(int nb, const void * vx, size_t bx, const DataInfo& } } -template +template void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { assert(n%Unpacker::block_size() == 0); - Q8 q8(info); + Q8 q8(info); + int nb = n/Unpacker::block_size(); + if constexpr (std::is_same_v) { + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ8_2S, Block, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ8_2S, Block, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x); + } + } + else { + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ8_0, Block, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ8_0, Block, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x); + } + } +} + +template +void mul_mat_qX_0_q8_2_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8 q8(info); int nb = n/Unpacker::block_size(); if (nb%4 == 0) { mul_mat_qX_q8_Helper, ScaleHelperQ8_0, block_q8_0, nrc_y>( @@ -393,11 +437,11 @@ void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info template void mul_mat_qX_0_q8_0_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) { static_assert(8%nrc_y == 0); - Q8 q8(info); + Q8 q8(info); int nb = n/Unpacker::block_size(); Unpacker unp(vx, bx); typename Unpacker::Sum4T sum4; - ScaleHelperQ8_0 scales; + ScaleHelperQ8_2S scales; __m256 result[8]; auto store = [&info, &result] (int ix0) { if constexpr (nrc_y == 1) { @@ -549,19 +593,15 @@ struct Q4_0_1_Dequantizer { } }; -struct IQ4_NL_Dequantizer { +struct IQ4_NL_DequantizerU { Dequantizer4bit b4; -#ifdef HAVE_FANCY_SIMD const __m256i values = load_iq4nl_values_256(); -#else - const __m256i values = load_iq4k_values_256(); -#endif inline __m256i dequant(const block_iq4_nl * x) const { return _mm256_shuffle_epi8(values, b4.dequant(x->qs)); } }; -struct IQ4_NL0_Dequantizer { +struct IQ4_NL_DequantizerS { Dequantizer4bit b4; const __m256i values = load_iq4k_values_256(); inline __m256i dequant(const block_iq4_nl * x) const { @@ -705,7 +745,7 @@ struct Q_Unpacker { struct Q8_0_Unpacker final : public Q_Unpacker { Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ80; + using Sum4T = Sum4TypeQ82S; inline static int block_size() { return QK8_0; } }; struct Q8_0_1_Unpacker final : public Q_Unpacker, Q8_0_1_Dequantizer> { @@ -713,6 +753,11 @@ struct Q8_0_1_Unpacker final : public Q_Unpacker { + Q8_0_2_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + using Sum4T = Sum4TypeQ82; + inline static int block_size() { return QK8_0; } +}; struct Q4_0_Unpacker final : public Q_Unpacker { Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ80; @@ -729,19 +774,16 @@ struct MXFP4_Unpacker final : public Q_Unpacker, IQ4_NL_Dequantizer> { - IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} +struct IQ4_NL_UnpackerU final : public Q_Unpacker, IQ4_NL_DequantizerU> { + IQ4_NL_UnpackerU(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ82; inline static int block_size() { return QK4_NL; } }; -#else -struct IQ4_NL_Unpacker final : public Q_Unpacker { - IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - using Sum4T = Sum4TypeQ80; +struct IQ4_NL_UnpackerS final : public Q_Unpacker { + IQ4_NL_UnpackerS(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + using Sum4T = Sum4TypeQ82S; inline static int block_size() { return QK4_NL; } }; -#endif struct Q5_0_Unpacker final : public Q_Unpacker { Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} using Sum4T = Sum4TypeQ80; @@ -1872,19 +1914,20 @@ void iqk_convert_qX_1_q8_1_r8(int n, const void * vx, size_t bx, void * vy, int } template void set_functions(std::array& funcs) { - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_0_q8_0_T, Dequantizer, funcs) } + else if constexpr (std::is_same_v) { + IQK_SET_MUL_MAT_FUNCTIONS_T2(mul_mat_qX_0_q8_0_T, Dequantizer, block_q8_2, funcs) + } else if constexpr (std::is_same_v || std::is_same_v) { IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs) } - else if constexpr (std::is_same_v) { -#ifdef HAVE_FANCY_SIMD + else if constexpr (std::is_same_v) { IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs) -#else - IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_0_q8_0_T, Dequantizer, funcs) -#endif + } + else if constexpr (std::is_same_v) { + IQK_SET_MUL_MAT_FUNCTIONS_T2(mul_mat_qX_0_q8_0_T, Dequantizer, block_q8_2, funcs) } else if constexpr (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || @@ -1902,7 +1945,7 @@ bool iqk_convert_legacy_quants_q8_r8(int type, int n, const void * vx, size_t bx case GGML_TYPE_Q5_0 : iqk_convert_qX_q80_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_Q5_1 : iqk_convert_qX_1_q8_1_r8>(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_Q6_0 : iqk_convert_qX_q80_r8(n, vx, bx, vy, nrc_x); break; - case GGML_TYPE_IQ4_NL: iqk_convert_qX_q80_r8(n, vx, bx, vy, nrc_x); break; + case GGML_TYPE_IQ4_NL: iqk_convert_qX_q80_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_Q8_0 : iqk_convert_q80_q80_r8(n, vx, bx, vy, nrc_x); break; case GGML_TYPE_MXFP4 : iqk_convert_qX_q80_r8(n, vx, bx, vy, nrc_x); break; default: return false; @@ -1939,20 +1982,17 @@ bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array(kernels); #else set_functions(kernels); - expected_typeB = GGML_TYPE_Q8_0_X4; #endif break; case GGML_TYPE_IQ4_NL: - set_functions(kernels); -#ifndef HAVE_FANCY_SIMD - expected_typeB = GGML_TYPE_Q8_0_X4; +#ifdef HAVE_FANCY_SIMD + set_functions(kernels); +#else + set_functions(kernels); #endif break; case GGML_TYPE_MXFP4: set_functions(kernels); -//#ifndef HAVE_FANCY_SIMD -// expected_typeB = GGML_TYPE_Q8_0_X4; -//#endif break; case GGML_TYPE_Q4_0_R8: IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q4_0_r8_q8_2, kernels) @@ -3223,6 +3263,19 @@ inline std::pair mul_mat_kernel(int int_typeA, int nq) { case 7: return std::make_pair(mul_mat, 7>, 7);\ }\ } +#define MAKE_FUNCS2(mul_mat, block, n) \ + if (n >= kMaxQ) return std::make_pair(mul_mat, kMaxQ, block>, kMaxQ);\ + else {\ + switch (n) {\ + case 1: return std::make_pair(mul_mat, 1, block>, 1);\ + case 2: return std::make_pair(mul_mat, 2, block>, 2);\ + case 3: return std::make_pair(mul_mat, 3, block>, 3);\ + case 4: return std::make_pair(mul_mat, 4, block>, 4);\ + case 5: return std::make_pair(mul_mat, 5, block>, 5);\ + case 6: return std::make_pair(mul_mat, 6, block>, 6);\ + case 7: return std::make_pair(mul_mat, 7, block>, 7);\ + }\ + } #define MAKE_FUNCS_ONLY_NRC(mul_mat, n) \ if (n >= kMaxQ) return std::make_pair(mul_mat, kMaxQ);\ else {\ @@ -3249,7 +3302,11 @@ inline std::pair mul_mat_kernel(int int_typeA, int nq) { if (nq == 1) return std::make_pair(mul_mat_qX_0_q8_0_Tx, 1); if (nq == 2) return std::make_pair(mul_mat_qX_0_q8_0_Tx, 2); if (nq == 4) return std::make_pair(mul_mat_qX_0_q8_0_Tx, 4); - MAKE_FUNCS(mul_mat_qX_0_q8_0_T, 3); + if (nq == 5) return std::make_pair(mul_mat_qX_0_q8_0_T, 5); + if (nq == 6) return std::make_pair(mul_mat_qX_0_q8_0_T, 6); + if (nq == 7) return std::make_pair(mul_mat_qX_0_q8_0_T, 7); + return std::make_pair(mul_mat_qX_0_q8_0_T, kMaxQ); #endif #endif } @@ -3293,9 +3350,9 @@ inline std::pair mul_mat_kernel(int int_typeA, int nq) { MAKE_FUNCS(mul_mat_qX_0_q8_0= 32 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_Q5_1 : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type; case GGML_TYPE_Q6_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; -#ifdef HAVE_FANCY_SIMD case GGML_TYPE_IQ4_NL : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; -#endif case GGML_TYPE_MXFP4 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_Q8_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type; case GGML_TYPE_IQ1_KT : return nrc_y >= 16 ? GGML_TYPE_Q8_0_R8 : type; @@ -351,6 +349,7 @@ struct MulMat { case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_K_R8: return 8; case GGML_TYPE_Q4_0_R8: + case GGML_TYPE_Q8_0_R8: case GGML_TYPE_Q8_K_R16: case GGML_TYPE_BF16_R16: return 16; default: return 1;