Refactor iqk: Factor out GEMM for repacked legacy quants

This commit is contained in:
Iwan Kawrakow
2025-05-18 10:20:54 +03:00
parent 7868545062
commit 6cd3609a85
6 changed files with 988 additions and 1048 deletions

View File

@@ -138,6 +138,27 @@ typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& inf
#define IQK_MAX_NY 8
// Fill funcs[i] with kernel<Dequantizer, i+1> for i = 0..7, i.e. one
// instantiation per supported ny value (1..IQK_MAX_NY).
// NOTE: comments cannot be placed inside the macro body itself — every
// body line ends in a '\' continuation, and a '//' there would splice
// the next line into the comment.
#define IQK_SET_MUL_MAT_FUNCTIONS_T(kernel, Dequantizer, funcs) \
funcs[0] = kernel<Dequantizer, 1>;\
funcs[1] = kernel<Dequantizer, 2>;\
funcs[2] = kernel<Dequantizer, 3>;\
funcs[3] = kernel<Dequantizer, 4>;\
funcs[4] = kernel<Dequantizer, 5>;\
funcs[5] = kernel<Dequantizer, 6>;\
funcs[6] = kernel<Dequantizer, 7>;\
funcs[7] = kernel<Dequantizer, 8>;\
// Same as IQK_SET_MUL_MAT_FUNCTIONS_T but for kernels templated only on
// the ny value: funcs[i] = kernel<i+1> for i = 0..7 (1..IQK_MAX_NY).
// Comments must stay outside the macro because of the '\' continuations.
#define IQK_SET_MUL_MAT_FUNCTIONS(kernel, funcs) \
funcs[0] = kernel<1>;\
funcs[1] = kernel<2>;\
funcs[2] = kernel<3>;\
funcs[3] = kernel<4>;\
funcs[4] = kernel<5>;\
funcs[5] = kernel<6>;\
funcs[6] = kernel<7>;\
funcs[7] = kernel<8>;\
// ==================================================================================================
static inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
@@ -234,6 +255,13 @@ static inline __m256i load_iq4nl_values_256() {
return MM256_SET_M128I(val128, val128);
}
#ifdef HAVE_FANCY_SIMD
// Widen the 256-bit iq4_nl value table to 512 bits: the same 32 bytes are
// placed in the lower and the upper half of the returned register.
static inline __m512i load_iq4nl_values_512() {
    const __m256i half = load_iq4nl_values_256();
    return _mm512_inserti32x8(_mm512_castsi256_si512(half), half, 1);
}
#endif
// Load the iq4k codebook (iq4k_values, defined elsewhere in the project)
// into a 128-bit vector; unaligned load, so no alignment requirement on
// the table.
static inline __m128i load_iq4k_values_128() {
return _mm_loadu_si128((const __m128i *)iq4k_values);
}

View File

@@ -11,11 +11,6 @@ namespace {
#ifdef HAVE_FANCY_SIMD
// AVX-512 helper: duplicate the 256-bit iq4_nl value table into both
// halves of a 512-bit register.
__m512i inline load_iq4nl_values_512() {
auto val256 = load_iq4nl_values_256();
return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
struct IQXKScales {
IQXKScales(uint8_t shift, int8_t min_val) : eshift(_mm256_set1_epi16(shift)), min(_mm256_set1_epi16(min_val)) {}
template <typename Q8>

View File

@@ -285,11 +285,6 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
};
// AVX-512 helper: duplicate the 256-bit iq4_nl value table into both
// halves of a 512-bit register.
__m512i inline load_iq4nl_values_512() {
auto val256 = load_iq4nl_values_256();
return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
template <typename Q8>

File diff suppressed because it is too large. [Load Diff]

View File

@@ -6,6 +6,6 @@
#include <array>
bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels);
bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels, mul_mat_t& func16);
#endif

File diff suppressed because it is too large. [Load Diff]