mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-03-14 07:48:16 +00:00)
This is better
@@ -1859,7 +1859,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .from_float_ref = (ggml_from_float_t)quantize_row_q8_0_r8_ref,
         .vec_dot        = vec_dot_q8_0_r8_q8_0,
 #if GGML_USE_IQK_MULMAT
-#if defined __AVX2__
+#if defined HAVE_FANCY_SIMD
         .vec_dot_type   = GGML_TYPE_Q8_2_X4,
 #else
         .vec_dot_type   = GGML_TYPE_Q8_0_X4,
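The vec_dot_type entry tells ggml which format the activations (the right-hand side) get quantized to before the kernel runs, so this hunk switches AVX2 builds without fancy SIMD back to q8_0 activations. For orientation, a minimal sketch of the two block layouts in C++: block_q8_0 is the stock ggml layout with an fp16 scale, while the block_q8_2 fields are an assumption inferred from the bf16 conversions later in this diff, not copied from the repository.

    #include <cstdint>

    #define QK8_0 32
    typedef struct {
        uint16_t d;          // per-block scale, fp16 bits (ggml_half)
        int8_t   qs[QK8_0];  // 32 signed 8-bit quants
    } block_q8_0;

    #define QK8_2 32        // assumed 32, matching QK8_0
    typedef struct {
        uint16_t d;          // assumed: per-block scale, bf16 bits
        uint16_t s;          // assumed: per-block sum, bf16 bits (by analogy with q8_1)
        int8_t   qs[QK8_2];  // 32 signed 8-bit quants
    } block_q8_2;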
@@ -299,7 +299,7 @@ template <int D>
 struct HelperQ80R8 : public BaseHelper {
     using Base = BaseHelper;
     constexpr static ggml_type type = GGML_TYPE_Q8_0_R8;
-#ifdef __AVX2__
+#ifdef HAVE_FANCY_SIMD
     constexpr static int block_size_q = QK8_2;
     using block_q8 = block_q8_2;
 #else
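The same narrowing is applied to this flash-attention helper: the q8_2 block type is now selected only when HAVE_FANCY_SIMD is available. A sketch of the full selection, reusing the layouts above; only the #ifdef side appears in the hunk, so the #else branch is an assumed q8_0 fallback.

    struct HelperQ80R8Sketch {
    #ifdef HAVE_FANCY_SIMD
        constexpr static int block_size_q = QK8_2;   // from the hunk
        using block_q8 = block_q8_2;
    #else
        constexpr static int block_size_q = QK8_0;   // assumed fallback
        using block_q8 = block_q8_0;                 // assumed fallback
    #endif
    };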
@@ -1613,7 +1613,7 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
 template <int nrc_y>
 static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%8 == 0);
-    Q8<nrc_y, block_q8_2_x4> q8(info);
+    Q8<nrc_y, block_q8_0_x4> q8(info);
     auto m1 = _mm256_set1_epi16(1);
     int nb = n / QK8_0;
     __m256 acc[nrc_y] = {};
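Whichever activation type is selected, the kernel computes the same quantity: for each output element, a blockwise int8 dot product scaled by the product of the two per-block scales. A scalar reference sketch (illustrative names, not the repository's code):

    // C = sum over 32-wide blocks of d_x[ib] * d_y[ib] * <x_block, y_block>
    float q8_dot(int nblocks, const float* dx, const int8_t* x,
                 const float* dy, const int8_t* y) {
        float acc = 0.0f;
        for (int ib = 0; ib < nblocks; ++ib) {
            int sumi = 0;
            for (int k = 0; k < 32; ++k)
                sumi += int(x[32*ib + k]) * int(y[32*ib + k]);
            acc += dx[ib] * dy[ib] * float(sumi);   // the d4*d8 step in the kernel
        }
        return acc;
    }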
@@ -1636,7 +1636,7 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
     const block_q8_0_r8 * iq8 = (const block_q8_0_r8 *)((const char *)vx + ix*bx);
     for (int ib4 = 0; ib4 < nb/4; ++ib4) {
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto scales = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d)), 16));
+            auto scales = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)q8.y[iy][ib4].d));
             _mm_storeu_ps(d8 + 4*iy, scales);
         }
         for (int k = 0; k < 4; ++k) {
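The two scales lines decode the same four per-block scales from different 16-bit encodings. bf16 is the upper half of an fp32, so it widens with a zero-extend and a 16-bit left shift; fp16 needs a real conversion, F16C's _mm_cvtph_ps. A self-contained sketch of both paths:

    #include <immintrin.h>
    #include <cstdint>

    // 4 bf16 values -> 4 fp32: zero-extend to 32 bits, shift into the high half
    static inline __m128 bf16x4_to_fp32(const uint16_t* p) {
        __m128i w = _mm_loadl_epi64((const __m128i *)p);
        return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(w), 16));
    }

    // 4 fp16 values -> 4 fp32: requires F16C
    static inline __m128 fp16x4_to_fp32(const uint16_t* p) {
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)p));
    }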
@@ -1668,9 +1668,9 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
             sx[j] = _mm256_sign_epi8(qx[j], qx[j]);
         }
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_2 *)q8.y[iy];
+            auto qy = (const block_q8_0 *)q8.y[iy];
             auto sumi = dot(qy[ib].qs);
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d})));
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
         }
         for (int j = 0; j < 4; ++j) {
@@ -1678,9 +1678,9 @@ static void mul_mat_q8_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
             sx[j] = _mm256_sign_epi8(qx[j], qx[j]);
         }
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_2 *)q8.y[iy];
+            auto qy = (const block_q8_0 *)q8.y[iy];
             auto sumi = dot(qy[ib].qs+16);
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(ggml_bf16_t{qy[ib].d})));
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_FP16_TO_FP32(qy[ib].d)));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
         }
     }
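In both hunks, sx[j] = _mm256_sign_epi8(qx[j], qx[j]) is the standard AVX2 int8 dot-product idiom: _mm256_maddubs_epi16 multiplies unsigned bytes by signed bytes, so the left operand is made non-negative and its sign is transferred to the right operand instead. A sketch of the core step, where m1 is the _mm256_set1_epi16(1) from the kernel and the function name is illustrative:

    #include <immintrin.h>

    static inline __m256i dot_step(__m256i x, __m256i y, __m256i m1) {
        __m256i ax  = _mm256_sign_epi8(x, x);        // |x| bytewise
        __m256i sy  = _mm256_sign_epi8(y, x);        // y with x's signs applied
        __m256i p16 = _mm256_maddubs_epi16(ax, sy);  // u8*s8 pairs summed to s16
        return _mm256_madd_epi16(p16, m1);           // s16 pairs summed to s32
    }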
@@ -1968,6 +1968,9 @@ bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mu
             break;
         case GGML_TYPE_Q8_0_R8:
             IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_0_r8_q8_2, kernels)
+#ifndef HAVE_FANCY_SIMD
+            expected_typeB = GGML_TYPE_Q8_0_X4;
+#endif
             break;
         case GGML_TYPE_IQ4_NL_R4:
             IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq4_nl_r4_q8_2, kernels)
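The added #ifndef keeps both halves of the contract in sync: the activation type this kernel was compiled to read must match the vec_dot_type that ggml quantizes to (the first hunk of this commit). A compile-time sketch of that invariant, with an illustrative local enum rather than the real ggml_type values:

    enum class TypeB { Q8_0_X4, Q8_2_X4 };

    constexpr TypeB expected_type_b() {
    #ifdef HAVE_FANCY_SIMD
        return TypeB::Q8_2_X4;   // fancy-SIMD builds read q8_2 activations
    #else
        return TypeB::Q8_0_X4;   // plain AVX2 builds read q8_0 activations
    #endif
    }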
@@ -244,7 +244,7 @@ struct MulMat {
             case GGML_TYPE_Q3_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_Q4_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
             case GGML_TYPE_Q5_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
-            case GGML_TYPE_Q6_K  : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
+            //case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
             case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_IQ2_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
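This table chooses an on-the-fly repack target for the weights once the batch size nrc_y crosses a threshold, and the commit disables the Q6_K to Q8_0_R8 repack in both copies of the table. A self-contained sketch of the selection shape, with a local enum and the thresholds copied from the hunk:

    enum Type { Q4_K, Q5_K, Q6_K, Q8_1, Q8_0_R8 };

    static Type repack_type(Type t, int nrc_y) {
        switch (t) {
            case Q4_K: return nrc_y >= 32 ? Q8_1 : t;
            case Q5_K: return nrc_y >= 32 ? Q8_1 : t;
            //case Q6_K: return nrc_y >= 64 ? Q8_0_R8 : t;  // disabled by this commit
            default:   return t;
        }
    }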
@@ -278,7 +278,7 @@ struct MulMat {
             case GGML_TYPE_Q3_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_Q4_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
             case GGML_TYPE_Q5_K  : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
-            case GGML_TYPE_Q6_K  : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
+            //case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
             case GGML_TYPE_IQ1_S  : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_IQ1_M  : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
             case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;