mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-10 22:10:20 +00:00
mxfp4: AVX2 GEMM
This commit is contained in:
@@ -1301,14 +1301,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_iq4_nl,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
|
||||
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
||||
#if GGML_USE_IQK_MULMAT
|
||||
#if defined HAVE_FANCY_SIMD
|
||||
.vec_dot_type = GGML_TYPE_Q8_2_X4,
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_0_X4,
|
||||
#endif
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#endif
|
||||
.nrows = 1,
|
||||
.row_meta_size = 0,
|
||||
@@ -1335,7 +1331,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_mxfp4,
|
||||
.from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
|
||||
.vec_dot = vec_dot_mxfp4_q8_0_x4,
|
||||
#if defined HAVE_FANCY_SIMD
|
||||
#if defined __AVX2__
|
||||
.vec_dot_type = GGML_TYPE_Q8_2_X4,
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_0_X4,
|
||||
|
||||
@@ -105,6 +105,21 @@ struct ScaleHelperQ_0 {
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_0_MXFP4 {
|
||||
float scales[4];
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) scales[j] = GGML_E8M0_TO_FP32_HALF(y[j].e);
|
||||
return _mm_loadu_ps(scales);
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(__m128 other_scales, const Q * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_E8M0_TO_FP32_HALF(y->e); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
template <int min_value>
|
||||
struct ScaleHelperQ_0_1 {
|
||||
ggml_half scales8[4];
|
||||
@@ -582,11 +597,7 @@ static inline __m256i load_mxfp4_values_256() {
|
||||
|
||||
struct MXFP4_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
const __m256i values = load_unsigned_mxfp4_values_256();
|
||||
#else
|
||||
const __m256i values = load_mxfp4_values_256();
|
||||
#endif
|
||||
inline __m256i dequant(const block_mxfp4 * x) const {
|
||||
return _mm256_shuffle_epi8(values, b4.dequant(x->qs));
|
||||
}
|
||||
@@ -712,20 +723,20 @@ struct Q4_0_1_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0_1<8>
|
||||
using Sum4T = Sum4q4<block_q8_2, block_q8_2_x4>;
|
||||
inline static int block_size() { return QK4_0; }
|
||||
};
|
||||
struct MXFP4_Unpacker final : public Q_Unpacker<block_mxfp4, ScaleHelperQ_0_1_MXFP4<12>, MXFP4_Dequantizer> {
|
||||
MXFP4_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
struct IQ4_MXFP4_Unpacker final : public Q_Unpacker<block_mxfp4, ScaleHelperQ_0_1_MXFP4<12>, MXFP4_Dequantizer> {
|
||||
IQ4_MXFP4_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#else
|
||||
struct IQ4_MXFP4_Unpacker final : public Q_Unpacker<block_mxfp4, ScaleHelperQ_0, MXFP4_Dequantizer> {
|
||||
IQ4_MXFP4_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL0_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
@@ -1867,7 +1878,7 @@ template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
|
||||
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs)
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_Unpacker> || std::is_same_v<Dequantizer, IQ4_MXFP4_Unpacker>) {
|
||||
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs)
|
||||
#else
|
||||
@@ -1875,7 +1886,8 @@ template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX
|
||||
#endif
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker>) {
|
||||
std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, MXFP4_Unpacker>) {
|
||||
IQK_SET_MUL_MAT_FUNCTIONS_T(mul_mat_qX_1_q8_2_T, Dequantizer, funcs)
|
||||
}
|
||||
}
|
||||
@@ -1936,10 +1948,10 @@ bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mu
|
||||
#endif
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
set_functions<IQ4_MXFP4_Unpacker>(kernels);
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
#endif
|
||||
set_functions<MXFP4_Unpacker>(kernels);
|
||||
//#ifndef HAVE_FANCY_SIMD
|
||||
// expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
//#endif
|
||||
break;
|
||||
case GGML_TYPE_Q4_0_R8:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q4_0_r8_q8_2, kernels)
|
||||
|
||||
@@ -265,7 +265,9 @@ struct MulMat {
|
||||
case GGML_TYPE_Q5_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q5_1 : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q6_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
case GGML_TYPE_IQ4_NL : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
#endif
|
||||
case GGML_TYPE_MXFP4 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q8_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ1_KT : return nrc_y >= 16 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
|
||||
Reference in New Issue
Block a user