This is better

This commit is contained in:
Iwan Kawrakow
2025-08-19 19:53:17 +03:00
parent 2572d16399
commit 5d39c132f2
4 changed files with 13 additions and 10 deletions

View File

@@ -1859,7 +1859,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.from_float_ref = (ggml_from_float_t)quantize_row_q8_0_r8_ref,
.vec_dot = vec_dot_q8_0_r8_q8_0,
#if GGML_USE_IQK_MULMAT
#if defined __AVX2__
#if defined HAVE_FANCY_SIMD
.vec_dot_type = GGML_TYPE_Q8_2_X4,
#else
.vec_dot_type = GGML_TYPE_Q8_0_X4,

View File

@@ -299,7 +299,7 @@ template <int D>
struct HelperQ80R8 : public BaseHelper {
using Base = BaseHelper;
constexpr static ggml_type type = GGML_TYPE_Q8_0_R8;
#ifdef __AVX2__
#ifdef HAVE_FANCY_SIMD
constexpr static int block_size_q = QK8_2;
using block_q8 = block_q8_2;
#else

View File

@@ -1613,7 +1613,7 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
template <int nrc_y>
static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%8 == 0);
Q8<nrc_y, block_q8_2_x4> q8(info);
Q8<nrc_y, block_q8_0_x4> q8(info);
auto m1 = _mm256_set1_epi16(1);
int nb = n / QK8_0;
__m256 acc[nrc_y] = {};
@@ -1636,7 +1636,7 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
const block_q8_0_r8 * iq8 = (const block_q8_0_r8 *)((const char *)vx + ix*bx);
for (int ib4 = 0; ib4 < nb/4; ++ib4) {
for (int iy = 0; iy < nrc_y; ++iy) {
auto scales = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d)), 16));
auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d));
_mm_storeu_ps(d8 + 4*iy, scales);
}
for (int k = 0; k < 4; ++k) {
@@ -1668,9 +1668,9 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
sx[j] = _mm256_sign_epi8(qx[j], qx[j]);
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto qy = (const block_q8_2 *)q8.y[iy];
auto qy = (const block_q8_0 *)q8.y[iy];
auto sumi = dot(qy[ib].qs);
auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d})));
auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)));
acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
}
for (int j = 0; j < 4; ++j) {
@@ -1678,9 +1678,9 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
sx[j] = _mm256_sign_epi8(qx[j], qx[j]);
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto qy = (const block_q8_2 *)q8.y[iy];
auto qy = (const block_q8_0 *)q8.y[iy];
auto sumi = dot(qy[ib].qs+16);
auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d})));
auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)));
acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
}
}
@@ -1968,6 +1968,9 @@ bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mu
break;
case GGML_TYPE_Q8_0_R8:
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_0_r8_q8_2, kernels)
#ifndef HAVE_FANCY_SIMD
expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
break;
case GGML_TYPE_IQ4_NL_R4:
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq4_nl_r4_q8_2, kernels)

View File

@@ -244,7 +244,7 @@ struct MulMat {
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_Q4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
//case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
@@ -278,7 +278,7 @@ struct MulMat {
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_Q4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
//case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;