|
|
|
|
@@ -79,6 +79,16 @@ template <typename Q8, typename Q8x4> struct Sum4q4 {
|
|
|
|
|
// Dot product of 32 (u8, s8) byte pairs: _mm256_maddubs_epi16 multiplies
// unsigned bytes of x by signed bytes of y and adds adjacent products into
// signed 16-bit lanes; the _mm256_madd_epi16 with an all-ones vector then
// pairs those into eight 32-bit sums. NOTE(review): maddubs saturates the
// 16-bit intermediate — assumed not to overflow for these quant ranges.
inline __m256i compute(__m256i x, __m256i y) const { return _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(x, y)); }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Convert the 8 16-bit scales of a block group into 8 floats.
// scales[0..3] are bf16 values ("d"): promoted to fp32 by moving the 16 bits
// into the high half of each 32-bit lane. scales[4..7] are signed 16-bit
// integers ("m"), converted numerically. Result: low 128 bits = d,
// high 128 bits = d*m.
inline __m256 convert_scales(const uint16_t * scales) {
    auto d_raw = _mm_loadl_epi64((const __m128i *)scales);            // 4 x bf16
    auto m_raw = _mm_loadl_epi64((const __m128i *)(scales + 4));      // 4 x int16
    auto d = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(d_raw), 16));
    auto m = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(m_raw));
    auto dm = _mm_mul_ps(d, m);
    return _mm256_set_m128(dm, d);
}
|
|
|
|
|
|
|
|
|
|
// Convert 4 bf16 scales to 4 fp32 values: widen each 16-bit value into the
// high half of a 32-bit lane (bf16 is the top 16 bits of an fp32 pattern).
inline __m128 convert_scales_s(const uint16_t * scales) {
    auto raw      = _mm_loadl_epi64((const __m128i *)scales);
    auto widened  = _mm_cvtepu16_epi32(raw);
    auto as_fp32  = _mm_slli_epi32(widened, 16);
    return _mm_castsi128_ps(as_fp32);
}
|
|
|
|
|
|
|
|
|
|
struct ScaleHelperQ8_0 {
|
|
|
|
|
inline __m128 prepare4(const block_q8_0 * y) {
|
|
|
|
|
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
|
|
|
|
|
@@ -106,6 +116,20 @@ struct ScaleHelperQ_0 {
|
|
|
|
|
// Combined scale for one block: an externally supplied factor d times this
// block's own scale (delegates to the single-argument prepare1 overload).
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ScaleHelperQ8_2S {
|
|
|
|
|
template <typename Q>
|
|
|
|
|
inline __m128 prepare4(const Q * y) {
|
|
|
|
|
const block_q8_2_x4 * y4 = (const block_q8_2_x4 *)y;
|
|
|
|
|
return convert_scales_s((const uint16_t *)y4->d);
|
|
|
|
|
}
|
|
|
|
|
template <typename Q>
|
|
|
|
|
inline __m128 prepare4(__m128 other_scales, const Q * y) {
|
|
|
|
|
return _mm_mul_ps(other_scales, prepare4<Q>(y));
|
|
|
|
|
}
|
|
|
|
|
template <typename Q> static inline float prepare1(const Q * y) { return GGML_BF16_TO_FP32(ggml_bf16_t{y->d}); }
|
|
|
|
|
template <typename Q> static inline float prepare1(float d, const Q * y) { return d*prepare1(y); }
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct ScaleHelperQ_0_MXFP4 {
|
|
|
|
|
float scales[4];
|
|
|
|
|
template <typename Q>
|
|
|
|
|
@@ -188,12 +212,6 @@ struct ScaleHelperQ8_1 {
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Convert 8 packed 16-bit scales to 8 floats: scales[0..3] are bf16 "d"
// values (promoted to fp32 by shifting into the high 16 bits of a lane),
// scales[4..7] are signed 16-bit integers "m". The returned vector holds d
// in the low 128 bits and d*m in the high 128 bits.
// NOTE(review): m presumably carries per-block sums/mins for the q8_2-style
// dot products — confirm against the block layout where this is consumed.
inline __m256 convert_scales(const uint16_t * scales) {
    auto aux_d = _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales)), 16));
    auto aux_m = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(scales+4))));
    return _mm256_set_m128(_mm_mul_ps(aux_d, aux_m), aux_d);
}
|
|
|
|
|
|
|
|
|
|
struct ScaleHelperQ8_2 {
|
|
|
|
|
template <typename Q>
|
|
|
|
|
inline __m256 prepare4(const Q * y) {
|
|
|
|
|
@@ -348,6 +366,7 @@ using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;
|
|
|
|
|
|
|
|
|
|
using Sum4TypeQ80 = Sum4<block_q8_0, block_q8_0_x4, SignedDot, false>;
|
|
|
|
|
using Sum4TypeQ82 = Sum4<block_q8_2, block_q8_2_x4, UnsignedDot, false>;
|
|
|
|
|
using Sum4TypeQ82S = Sum4<block_q8_2, block_q8_2_x4, SignedDot, false>;
|
|
|
|
|
|
|
|
|
|
template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
|
|
|
|
|
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
|
|
|
|
|
@@ -374,10 +393,35 @@ void mul_mat_qX_q8_Helper_x2(int nb, const void * vx, size_t bx, const DataInfo&
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename Unpacker, int nrc_y>
|
|
|
|
|
template <typename Unpacker, int nrc_y, typename Block = block_q8_0>
|
|
|
|
|
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
|
|
|
|
assert(n%Unpacker::block_size() == 0);
|
|
|
|
|
Q8<nrc_y, block_q8_0> q8(info);
|
|
|
|
|
Q8<nrc_y, Block> q8(info);
|
|
|
|
|
int nb = n/Unpacker::block_size();
|
|
|
|
|
if constexpr (std::is_same_v<Block, block_q8_2>) {
|
|
|
|
|
if (nb%4 == 0) {
|
|
|
|
|
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, true>, ScaleHelperQ8_2S, Block, nrc_y>(
|
|
|
|
|
nb, vx, bx, info, q8.y, nrc_x);
|
|
|
|
|
} else {
|
|
|
|
|
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, false>, ScaleHelperQ8_2S, Block, nrc_y>(
|
|
|
|
|
nb, vx, bx, info, q8.y, nrc_x);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
if (nb%4 == 0) {
|
|
|
|
|
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, true>, ScaleHelperQ8_0, Block, nrc_y>(
|
|
|
|
|
nb, vx, bx, info, q8.y, nrc_x);
|
|
|
|
|
} else {
|
|
|
|
|
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, false>, ScaleHelperQ8_0, Block, nrc_y>(
|
|
|
|
|
nb, vx, bx, info, q8.y, nrc_x);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename Unpacker, int nrc_y>
|
|
|
|
|
void mul_mat_qX_0_q8_2_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
|
|
|
|
assert(n%Unpacker::block_size() == 0);
|
|
|
|
|
Q8<nrc_y, block_q8_2> q8(info);
|
|
|
|
|
int nb = n/Unpacker::block_size();
|
|
|
|
|
if (nb%4 == 0) {
|
|
|
|
|
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, true>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
|
|
|
|
@@ -393,11 +437,11 @@ void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
|
|
|
|
|
template <typename Unpacker, int nrc_y, int nrc_x>
|
|
|
|
|
void mul_mat_qX_0_q8_0_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) {
|
|
|
|
|
static_assert(8%nrc_y == 0);
|
|
|
|
|
Q8<nrc_y, block_q8_0> q8(info);
|
|
|
|
|
Q8<nrc_y, block_q8_2> q8(info);
|
|
|
|
|
int nb = n/Unpacker::block_size();
|
|
|
|
|
Unpacker unp(vx, bx);
|
|
|
|
|
typename Unpacker::Sum4T sum4;
|
|
|
|
|
ScaleHelperQ8_0 scales;
|
|
|
|
|
ScaleHelperQ8_2S scales;
|
|
|
|
|
__m256 result[8];
|
|
|
|
|
auto store = [&info, &result] (int ix0) {
|
|
|
|
|
if constexpr (nrc_y == 1) {
|
|
|
|
|
@@ -549,19 +593,15 @@ struct Q4_0_1_Dequantizer {
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct IQ4_NL_Dequantizer {
|
|
|
|
|
struct IQ4_NL_DequantizerU {
|
|
|
|
|
Dequantizer4bit b4;
|
|
|
|
|
#ifdef HAVE_FANCY_SIMD
|
|
|
|
|
const __m256i values = load_iq4nl_values_256();
|
|
|
|
|
#else
|
|
|
|
|
const __m256i values = load_iq4k_values_256();
|
|
|
|
|
#endif
|
|
|
|
|
inline __m256i dequant(const block_iq4_nl * x) const {
|
|
|
|
|
return _mm256_shuffle_epi8(values, b4.dequant(x->qs));
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct IQ4_NL0_Dequantizer {
|
|
|
|
|
struct IQ4_NL_DequantizerS {
|
|
|
|
|
Dequantizer4bit b4;
|
|
|
|
|
const __m256i values = load_iq4k_values_256();
|
|
|
|
|
inline __m256i dequant(const block_iq4_nl * x) const {
|
|
|
|
|
@@ -705,7 +745,7 @@ struct Q_Unpacker {
|
|
|
|
|
|
|
|
|
|
struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
|
|
|
|
|
Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ80;
|
|
|
|
|
using Sum4T = Sum4TypeQ82S;
|
|
|
|
|
inline static int block_size() { return QK8_0; }
|
|
|
|
|
};
|
|
|
|
|
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
|
|
|
|
|
@@ -713,6 +753,11 @@ struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<12
|
|
|
|
|
using Sum4T = Sum4TypeQ82;
|
|
|
|
|
inline static int block_size() { return QK8_0; }
|
|
|
|
|
};
|
|
|
|
|
// Unpacker for Q8_0 data accumulated with the q8_2 sum type: same block
// type, scale helper and dequantizer as the plain Q8_0 unpacker, but selects
// Sum4TypeQ82 (UnsignedDot) for the 4-block accumulation.
struct Q8_0_2_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_2_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    using Sum4T = Sum4TypeQ82;
    inline static int block_size() { return QK8_0; }
};
|
|
|
|
|
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
|
|
|
|
|
Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ80;
|
|
|
|
|
@@ -729,19 +774,16 @@ struct MXFP4_Unpacker final : public Q_Unpacker<block_mxfp4, ScaleHelperQ_0_1_MX
|
|
|
|
|
using Sum4T = Sum4TypeQ82;
|
|
|
|
|
inline static int block_size() { return QK4_NL; }
|
|
|
|
|
};
|
|
|
|
|
#ifdef HAVE_FANCY_SIMD
|
|
|
|
|
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> {
|
|
|
|
|
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
struct IQ4_NL_UnpackerU final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_DequantizerU> {
|
|
|
|
|
IQ4_NL_UnpackerU(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ82;
|
|
|
|
|
inline static int block_size() { return QK4_NL; }
|
|
|
|
|
};
|
|
|
|
|
#else
|
|
|
|
|
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL0_Dequantizer> {
|
|
|
|
|
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ80;
|
|
|
|
|
struct IQ4_NL_UnpackerS final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL_DequantizerS> {
|
|
|
|
|
IQ4_NL_UnpackerS(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ82S;
|
|
|
|
|
inline static int block_size() { return QK4_NL; }
|
|
|
|
|
};
|
|
|
|
|
#endif
|
|
|
|
|
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
|
|
|
|
|
Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
|
|
|
|
using Sum4T = Sum4TypeQ80;
|
|
|
|
|
@@ -1872,19 +1914,20 @@ void iqk_convert_qX_1_q8_1_r8(int n, const void * vx, size_t bx, void * vy, int
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX_NY>& funcs) {
|
|
|
|
|
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
|
|
|
|
|
std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
|
|
|
|
|
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker>) {
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_0_q8_0_T, Dequantizer, funcs)
|
|
|
|
|
}
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T2(mul_mat_qX_0_q8_0_T, Dequantizer, block_q8_2, funcs)
|
|
|
|
|
}
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs)
|
|
|
|
|
}
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
|
|
|
|
|
#ifdef HAVE_FANCY_SIMD
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_UnpackerU>) {
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs)
|
|
|
|
|
#else
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_0_q8_0_T, Dequantizer, funcs)
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_UnpackerS>) {
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS_T2(mul_mat_qX_0_q8_0_T, Dequantizer, block_q8_2, funcs)
|
|
|
|
|
}
|
|
|
|
|
else if constexpr (std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> ||
|
|
|
|
|
std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker> ||
|
|
|
|
|
@@ -1902,7 +1945,7 @@ bool iqk_convert_legacy_quants_q8_r8(int type, int n, const void * vx, size_t bx
|
|
|
|
|
case GGML_TYPE_Q5_0 : iqk_convert_qX_q80_r8<block_q5_0, Q5_0_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_Q5_1 : iqk_convert_qX_1_q8_1_r8<block_q5_1, Q5_1_Dequantizer<block_q5_1>>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_Q6_0 : iqk_convert_qX_q80_r8<block_q6_0, Q6_0_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_IQ4_NL: iqk_convert_qX_q80_r8<block_iq4_nl, IQ4_NL0_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_IQ4_NL: iqk_convert_qX_q80_r8<block_iq4_nl, IQ4_NL_DequantizerS>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_Q8_0 : iqk_convert_q80_q80_r8(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
case GGML_TYPE_MXFP4 : iqk_convert_qX_q80_r8<block_mxfp4, MXFP40_Dequantizer>(n, vx, bx, vy, nrc_x); break;
|
|
|
|
|
default: return false;
|
|
|
|
|
@@ -1939,20 +1982,17 @@ bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mu
|
|
|
|
|
set_functions<Q8_0_1_Unpacker>(kernels);
|
|
|
|
|
#else
|
|
|
|
|
set_functions<Q8_0_Unpacker>(kernels);
|
|
|
|
|
expected_typeB = GGML_TYPE_Q8_0_X4;
|
|
|
|
|
#endif
|
|
|
|
|
break;
|
|
|
|
|
case GGML_TYPE_IQ4_NL:
|
|
|
|
|
set_functions<IQ4_NL_Unpacker>(kernels);
|
|
|
|
|
#ifndef HAVE_FANCY_SIMD
|
|
|
|
|
expected_typeB = GGML_TYPE_Q8_0_X4;
|
|
|
|
|
#ifdef HAVE_FANCY_SIMD
|
|
|
|
|
set_functions<IQ4_NL_UnpackerU>(kernels);
|
|
|
|
|
#else
|
|
|
|
|
set_functions<IQ4_NL_UnpackernS>(kernels);
|
|
|
|
|
#endif
|
|
|
|
|
break;
|
|
|
|
|
case GGML_TYPE_MXFP4:
|
|
|
|
|
set_functions<MXFP4_Unpacker>(kernels);
|
|
|
|
|
//#ifndef HAVE_FANCY_SIMD
|
|
|
|
|
// expected_typeB = GGML_TYPE_Q8_0_X4;
|
|
|
|
|
//#endif
|
|
|
|
|
break;
|
|
|
|
|
case GGML_TYPE_Q4_0_R8:
|
|
|
|
|
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q4_0_r8_q8_2, kernels)
|
|
|
|
|
@@ -3223,6 +3263,19 @@ inline std::pair<mul_mat_t, int> mul_mat_kernel(int int_typeA, int nq) {
|
|
|
|
|
case 7: return std::make_pair(mul_mat, 7>, 7);\
|
|
|
|
|
}\
|
|
|
|
|
}
|
|
|
|
|
// Like MAKE_FUNCS, but the kernel template takes an extra block-type
// parameter: dispatches on n (number of RHS rows, capped at kMaxQ) and
// returns the matching instantiation paired with the nrc it handles.
// Note the calling convention: `mul_mat` is passed with an unmatched '<'
// (e.g. MAKE_FUNCS2(mul_mat_qX_0_q8_0_T<SomeUnpacker, block_q8_2, nq)) so
// each `mul_mat, k, block>` expansion closes the template argument list.
#define MAKE_FUNCS2(mul_mat, block, n) \
    if (n >= kMaxQ) return std::make_pair(mul_mat, kMaxQ, block>, kMaxQ);\
    else {\
        switch (n) {\
            case 1: return std::make_pair(mul_mat, 1, block>, 1);\
            case 2: return std::make_pair(mul_mat, 2, block>, 2);\
            case 3: return std::make_pair(mul_mat, 3, block>, 3);\
            case 4: return std::make_pair(mul_mat, 4, block>, 4);\
            case 5: return std::make_pair(mul_mat, 5, block>, 5);\
            case 6: return std::make_pair(mul_mat, 6, block>, 6);\
            case 7: return std::make_pair(mul_mat, 7, block>, 7);\
        }\
    }
|
|
|
|
|
#define MAKE_FUNCS_ONLY_NRC(mul_mat, n) \
|
|
|
|
|
if (n >= kMaxQ) return std::make_pair(mul_mat<kMaxQ>, kMaxQ);\
|
|
|
|
|
else {\
|
|
|
|
|
@@ -3249,7 +3302,11 @@ inline std::pair<mul_mat_t, int> mul_mat_kernel(int int_typeA, int nq) {
|
|
|
|
|
if (nq == 1) return std::make_pair(mul_mat_qX_0_q8_0_Tx<Q8_0_Unpacker, 1, k_step>, 1);
|
|
|
|
|
if (nq == 2) return std::make_pair(mul_mat_qX_0_q8_0_Tx<Q8_0_Unpacker, 2, k_step>, 2);
|
|
|
|
|
if (nq == 4) return std::make_pair(mul_mat_qX_0_q8_0_Tx<Q8_0_Unpacker, 4, k_step>, 4);
|
|
|
|
|
MAKE_FUNCS(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, nq);
|
|
|
|
|
if (nq == 3) return std::make_pair(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, 3, block_q8_2>, 3);
|
|
|
|
|
if (nq == 5) return std::make_pair(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, 5, block_q8_2>, 5);
|
|
|
|
|
if (nq == 6) return std::make_pair(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, 6, block_q8_2>, 6);
|
|
|
|
|
if (nq == 7) return std::make_pair(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, 7, block_q8_2>, 7);
|
|
|
|
|
return std::make_pair(mul_mat_qX_0_q8_0_T<Q8_0_Unpacker, kMaxQ, block_q8_2>, kMaxQ);
|
|
|
|
|
#endif
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
@@ -3293,9 +3350,9 @@ inline std::pair<mul_mat_t, int> mul_mat_kernel(int int_typeA, int nq) {
|
|
|
|
|
MAKE_FUNCS(mul_mat_qX_0_q8_0<DequantizerIQ4NL, nq);
|
|
|
|
|
#else
|
|
|
|
|
#ifdef HAVE_FANCY_SIMD
|
|
|
|
|
MAKE_FUNCS(mul_mat_qX_1_q8_2_T<IQ4_NL_Unpacker, nq);
|
|
|
|
|
MAKE_FUNCS(mul_mat_qX_1_q8_2_T<IQ4_NL_UnpackerU, nq);
|
|
|
|
|
#else
|
|
|
|
|
MAKE_FUNCS(mul_mat_qX_0_q8_0_T<IQ4_NL_Unpacker, nq);
|
|
|
|
|
MAKE_FUNCS2(mul_mat_qX_0_q8_0_T<IQ4_NL_UnpackerS, block_q8_2, nq);
|
|
|
|
|
#endif
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|