mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-22 15:39:23 +00:00
q8_k_r16: iq4_xs now uses q8_k_r16 on Zen4+
PP performance is about the same as using q8_k_r8 on the Ryzen-7950X, so we expect nice gains on Zen5, and we don't need to worry about using 2 different q8_k_r8 implementations for fancy SIMD.
This commit is contained in:
@@ -1859,6 +1859,50 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q8_k_r16_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%16 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
__m512 acc[nrc_y] = {};
|
||||
__m512i isum[nrc_y] = {};
|
||||
__m512i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 16) {
|
||||
const block_q8_k_r16 * iq16 = (const block_q8_k_r16 *)((const char *)vx + ix*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto d4 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)iq16[ibl].d));
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) {
|
||||
qx[0] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+0);
|
||||
qx[1] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+1);
|
||||
qx[2] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+2);
|
||||
qx[3] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+3);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
|
||||
auto y256 = MM256_SET_M128I(y128, y128);
|
||||
auto y = _mm512_inserti32x8(_mm512_castsi256_si512(y256), y256, 1);
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[0], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x00)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[1], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x55)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[2], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xaa)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[3], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xff)));
|
||||
}
|
||||
}
|
||||
auto m4 = _mm512_mul_ps(d4, _mm512_set1_ps(-128.f));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto d4y = _mm512_mul_ps(d4, _mm512_set1_ps(q8.scale(iy, ibl)));
|
||||
acc[iy] = _mm512_fmadd_ps(d4y, _mm512_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
acc[iy] = _mm512_fmadd_ps(m4, _mm512_set1_ps(q8.y[iy][ibl].sum), acc[iy]);
|
||||
isum[iy] = _mm512_setzero_si512();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
info.store(ix, iy, acc[iy]);
|
||||
acc[iy] = _mm512_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q8_KV_q8_KV(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
@@ -2516,6 +2560,7 @@ void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
}
|
||||
}
|
||||
|
||||
template <int nr = 8>
|
||||
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
__m256i qs[16];
|
||||
@@ -2561,9 +2606,9 @@ inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 64*ib32;
|
||||
auto qs = (uint32_t *)q8_k + 8*nr*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
qs[nr*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
@@ -2571,27 +2616,35 @@ inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16
|
||||
|
||||
// TODO: move this to iqk_gemm_iquants
|
||||
void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq4_xs * x8[8];
|
||||
const block_iq4_xs * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
auto values128 = _mm_loadu_si128((const __m128i *)iq4k_values);
|
||||
auto values = MM256_SET_M128I(values128, values128);
|
||||
|
||||
int16_t ls[16];
|
||||
float dnew[8];
|
||||
float dnew[k_nr];
|
||||
uint32_t block[8];
|
||||
__m256i xv[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
ls[2*ib32+0] = ls[2*ib32+1] = (((x8[k][i].scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((x8[k][i].scales_h >> 2*ib32) & 3) << 4)) - 32;
|
||||
@@ -2599,9 +2652,17 @@ void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
xv[ib32] = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(bits, 4), bits), _mm256_set1_epi8(0xf));
|
||||
xv[ib32] = _mm256_shuffle_epi8(values, xv[ib32]);
|
||||
}
|
||||
dnew[k] = d * convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
dnew[k] = d * convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_loadu_ps(dnew), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_loadu_ps(dnew), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@@ -2671,10 +2732,15 @@ bool iqk_set_kernels_kquants(int ne00, int typeA, int typeB, std::array<mul_mat_
|
||||
break;
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r8_q8_k, kernels)
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
func16 = mul_mat_q8_k_r8_q8_k<16>;
|
||||
#endif
|
||||
//#ifdef HAVE_FANCY_SIMD
|
||||
// func16 = mul_mat_q8_k_r8_q8_k<16>;
|
||||
//#endif
|
||||
break;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r16_q8_k, kernels)
|
||||
break;
|
||||
#endif
|
||||
case GGML_TYPE_Q8_KV:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_KV_q8_KV, kernels)
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
|
||||
@@ -231,31 +231,36 @@ struct MulMat {
|
||||
static bool prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
|
||||
static inline ggml_type is_dequant_better(ggml_type type, int nrc_y) {
|
||||
#ifdef __AVX2__
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto q8_k_type = GGML_TYPE_Q8_K_R16;
|
||||
#else
|
||||
auto q8_k_type = GGML_TYPE_Q8_K_R8;
|
||||
#endif
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q4_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q4_1 : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q5_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
@@ -346,7 +351,7 @@ struct MulMat {
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_Q8_K_R8: return 8;
|
||||
case GGML_TYPE_Q4_0_R8:
|
||||
case GGML_TYPE_Q8_0_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_BF16_R16: return 16;
|
||||
default: return 1;
|
||||
}
|
||||
@@ -381,6 +386,7 @@ struct MulMat {
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_Q8_K_R8: return 8;
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_BF16_R16: return 16;
|
||||
default: return 1;
|
||||
}
|
||||
@@ -829,6 +835,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_KV:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
return iqk_set_kernels_kquants(ne00, typeA, typeB, mm.funcs, mm.func16);
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
@@ -924,6 +931,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_KV:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
return iqk_set_kernels_kquants(ne00, typeA, typeB, m.funcs, m.func16);
|
||||
case GGML_TYPE_IQ2_KS:
|
||||
case GGML_TYPE_IQ2_K:
|
||||
|
||||
Reference in New Issue
Block a user