mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 15:14:10 +00:00
Refactor iqk: Factor out GEMM for legacy quants (AVX2/AVX512)
This commit is contained in:
@@ -260,10 +260,12 @@ if (GGML_IQK_MUL_MAT)
|
||||
add_compile_definitions(GGML_USE_IQK_MULMAT)
|
||||
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp
|
||||
iqk/iqk_flash_attn.cpp
|
||||
iqk/iqk_gemm_floats.cpp)
|
||||
iqk/iqk_gemm_floats.cpp
|
||||
iqk/iqk_gemm_legacy_quants.cpp)
|
||||
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h
|
||||
iqk/iqk_flash_impl.h
|
||||
iqk/iqk_gemm_floats.h)
|
||||
iqk/iqk_gemm_floats.h
|
||||
iqk/iqk_gemm_legacy_quants.h)
|
||||
if (GGML_IQK_FLASH_ATTENTION)
|
||||
message(STATUS "Enabling IQK Flash Attention kernels")
|
||||
add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "iqk_config.h"
|
||||
|
||||
#if defined IQK_IMPLEMENT
|
||||
@@ -135,4 +137,105 @@ typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& inf
|
||||
|
||||
#define IQK_MAX_NY 8
|
||||
|
||||
// ==================================================================================================
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||
|
||||
static inline float hsum_float_4(__m128 x) {
|
||||
x = _mm_add_ps(x, _mm_movehl_ps(x, x));
|
||||
x = _mm_add_ss(x, _mm_movehdup_ps(x));
|
||||
return _mm_cvtss_f32(x);
|
||||
}
|
||||
static inline float hsum_float_8(__m256 x) {
|
||||
return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
|
||||
}
|
||||
static inline int hsum_i32_8(const __m256i a) {
|
||||
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
|
||||
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
|
||||
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
static inline float hmax_float_8(__m256 x) {
|
||||
__m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
|
||||
return _mm_cvtss_f32(max4);
|
||||
}
|
||||
|
||||
static inline __m256 hsum_float_8x8(__m256 * accm) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
accm[i] = _mm256_add_ps(_mm256_permute2f128_ps(accm[i], accm[i+4], 0x20), _mm256_permute2f128_ps(accm[i], accm[i+4], 0x31));
|
||||
//accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)),
|
||||
// _mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1)));
|
||||
}
|
||||
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
|
||||
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
|
||||
}
|
||||
|
||||
static inline __m128i load_iq4nl_values_128() {
|
||||
static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
|
||||
return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
|
||||
}
|
||||
|
||||
static inline __m256i load_iq4nl_values_256() {
|
||||
auto val128 = load_iq4nl_values_128();
|
||||
return MM256_SET_M128I(val128, val128);
|
||||
}
|
||||
|
||||
static inline __m128i load_iq4k_values_128() {
|
||||
return _mm_loadu_si128((const __m128i *)iq4k_values);
|
||||
}
|
||||
|
||||
static inline __m256i load_iq4k_values_256() {
|
||||
auto val128 = load_iq4k_values_128();
|
||||
return MM256_SET_M128I(val128, val128);
|
||||
}
|
||||
|
||||
template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
|
||||
|
||||
constexpr static int nrc_y = nrc;
|
||||
|
||||
Q8(const DataInfo& info) {
|
||||
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
|
||||
#endif
|
||||
inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
|
||||
inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
|
||||
inline float scale(int iy, int i) const { return y[iy][i].d; }
|
||||
|
||||
const block_q8 * y[nrc_y];
|
||||
};
|
||||
|
||||
template <int nrc> struct Q8_16 {
|
||||
|
||||
constexpr static int nrc_y = nrc;
|
||||
|
||||
Q8_16(const DataInfo& info) {
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto ptr = (const float *)info.src1_row(iy);
|
||||
std::memcpy(d + 5*iy, ptr, 5*sizeof(float));
|
||||
y[iy] = (const int8_t *)(ptr + 5);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
inline __m512i load_quants64(int iy, int i) const { return _mm512_loadu_si512((const __m512i*)y[iy] + i); }
|
||||
#endif
|
||||
inline __m256i load_quants(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy] + i); }
|
||||
inline float scale(int iy, int k) const { return d[5*iy+k]; }
|
||||
inline float sum_row(int iy) const { return d[5*iy + 4]; }
|
||||
inline __m128 scale(int iy) const { return _mm_loadu_ps(d + 5*iy); }
|
||||
|
||||
float d[5*nrc_y];
|
||||
const int8_t * y[nrc_y];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
789
ggml/src/iqk/iqk_gemm_legacy_quants.cpp
Normal file
789
ggml/src/iqk/iqk_gemm_legacy_quants.cpp
Normal file
@@ -0,0 +1,789 @@
|
||||
#include "iqk_gemm_legacy_quants.h"
|
||||
|
||||
#ifdef IQK_IMPLEMENT
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#define GGML_COMMON_IMPL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
//
|
||||
// ============================== Legacy quants
|
||||
//
|
||||
|
||||
namespace {
|
||||
|
||||
struct DotHelper {
|
||||
const __m256i m1 = _mm256_set1_epi16(1);
|
||||
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||
inline __m256i dot(__m256i x, __m256i y) const {
|
||||
return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
|
||||
}
|
||||
#else
|
||||
inline __m256i dot(__m256i x, __m256i y) const {
|
||||
return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
struct SignedDot {
|
||||
DotHelper helper;
|
||||
inline __m256i compute(__m256i x, __m256i y) const {
|
||||
return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
|
||||
}
|
||||
};
|
||||
struct UnsignedDot {
|
||||
DotHelper helper;
|
||||
inline __m256i compute(__m256i x, __m256i y) const {
|
||||
return helper.dot(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q8, typename Q8x4, typename Dot, bool can_pack = true> struct Sum4 {
|
||||
Dot dot;
|
||||
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
|
||||
const Q8x4 * y4 = (const Q8x4 *)y;
|
||||
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0)); // 8x block 0
|
||||
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1)); // 8x block 1
|
||||
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2)); // 8x block 2
|
||||
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3)); // 8x block 3
|
||||
if constexpr (can_pack) {
|
||||
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
|
||||
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
|
||||
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
} else {
|
||||
// Note to myself: this is much faster than using _mm256_hadd_epi32()
|
||||
auto p01 = _mm256_add_epi32(_mm256_unpacklo_epi32(p0, p1), _mm256_unpackhi_epi32(p0, p1)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto p23 = _mm256_add_epi32(_mm256_unpacklo_epi32(p2, p3), _mm256_unpackhi_epi32(p2, p3)); // 2,3, 2,3, 2,3, 2,3
|
||||
return _mm256_add_epi32(_mm256_unpacklo_epi64(p01, p23), _mm256_unpackhi_epi64(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
}
|
||||
}
|
||||
inline __m256i compute(__m256i x, __m256i y) const { return dot.compute(x, y); }
|
||||
};
|
||||
|
||||
template <typename Q8, typename Q8x4> struct Sum4q4 {
|
||||
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
|
||||
const Q8x4 * y4 = (const Q8x4 *)y;
|
||||
auto p0 = _mm256_maddubs_epi16(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0)); // 16x block 0
|
||||
auto p1 = _mm256_maddubs_epi16(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1)); // 16x block 1
|
||||
auto p2 = _mm256_maddubs_epi16(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2)); // 16x block 2
|
||||
auto p3 = _mm256_maddubs_epi16(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3)); // 16x block 3
|
||||
auto p01 = _mm256_add_epi16(_mm256_unpacklo_epi32(p0, p1), _mm256_unpackhi_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1, 0,0, 1,1, 0,0, 1,1
|
||||
auto p23 = _mm256_add_epi16(_mm256_unpacklo_epi32(p2, p3), _mm256_unpackhi_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3, 2,2, 3,3, 2,2, 3,3
|
||||
auto p0123 = _mm256_add_epi16(_mm256_unpacklo_epi64(p01, p23), _mm256_unpackhi_epi64(p01, p23)); // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3
|
||||
return _mm256_madd_epi16(_mm256_set1_epi16(1), p0123);
|
||||
}
|
||||
inline __m256i compute(__m256i x, __m256i y) const { return _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(x, y)); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ8_0 {
|
||||
inline __m128 prepare4(const block_q8_0 * y) {
|
||||
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
|
||||
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)y4->d));
|
||||
}
|
||||
inline __m128 prepare4(__m128 other_scales, const block_q8_0 * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_0 {
|
||||
ggml_half scales8[4];
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(__m128 other_scales, const Q * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
template <int min_value>
|
||||
struct ScaleHelperQ_0_1 {
|
||||
ggml_half scales8[4];
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
|
||||
return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm_mul256_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
float d = GGML_FP16_TO_FP32(y->d);
|
||||
return std::make_pair(d, -d*float(min_value));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
const __m128 min = _mm_set1_ps(float(-min_value));
|
||||
};
|
||||
|
||||
//template <int min_value>
|
||||
//struct ScaleHelperQ_0_2 {
|
||||
// ggml_bf16_t scales8[4];
|
||||
// template <typename Q>
|
||||
// inline __m256 prepare4(const Q * y) {
|
||||
// for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
// auto s4 = _mm_castsi128_ps(_mm_slli_epi16(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales8)), 16));
|
||||
// return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
|
||||
// }
|
||||
// template <typename Q>
|
||||
// inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
// return _mm_mul256_ps(other_scales, prepare4<Q>(y));
|
||||
// }
|
||||
// template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
// float d = GGML_BF16_TO_FP32(y->d);
|
||||
// return std::make_pair(d, -d*float(min_value));
|
||||
// }
|
||||
// std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
// return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
// }
|
||||
// const __m128 min = _mm_set1_ps(float(-min_value));
|
||||
//};
|
||||
|
||||
struct ScaleHelperQ8_1 {
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y;
|
||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)y4->d));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
};
|
||||
|
||||
struct ScaleHelperQ8_2 {
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
const block_q8_2_x4 * y4 = (const block_q8_2_x4 *)y;
|
||||
auto aux = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)y4->d));
|
||||
return _mm256_castsi256_ps(_mm256_slli_epi32(aux, 16));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_BF16_TO_FP32(y->d), GGML_BF16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s;
|
||||
return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_2 * y) const {
|
||||
ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s;
|
||||
return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s));
|
||||
}
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_1 {
|
||||
uint32_t scales8[4];
|
||||
const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
|
||||
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
// it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
|
||||
// complain that this breaks strict-aliasing rules.
|
||||
memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
|
||||
}
|
||||
return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
|
||||
}
|
||||
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
};
|
||||
|
||||
struct MinusType0 {
|
||||
inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
|
||||
inline float compute(float d, int) const { return d; }
|
||||
inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
|
||||
inline __m256 vresult(__m256 acc, int) const { return acc; }
|
||||
};
|
||||
|
||||
template <int nrc_y> struct MinusType1 {
|
||||
__m128 accm[nrc_y];
|
||||
MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
|
||||
inline __m256 compute(__m256 dm, int iy) {
|
||||
const __m128 d = _mm256_castps256_ps128(dm);
|
||||
const __m128 m = _mm256_extractf128_ps(dm, 1);
|
||||
accm[iy] = _mm_add_ps(accm[iy], m);
|
||||
return _mm256_set_m128(d, d);
|
||||
}
|
||||
inline float compute(const std::pair<float, float>& dm, int iy) {
|
||||
accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
|
||||
return dm.first;
|
||||
}
|
||||
inline float result(__m256 acc, int iy) const {
|
||||
const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
|
||||
return hsum_float_4(_mm_add_ps(sum, accm[iy]));
|
||||
}
|
||||
inline __m256 vresult(__m256 acc, int iy) const {
|
||||
return _mm256_add_ps(acc, _mm256_insertf128_ps(_mm256_setzero_ps(), accm[iy], 0));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
|
||||
__m256 acc[nrc_y];
|
||||
Minus accm;
|
||||
AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
|
||||
template <typename Unpacker, typename Scales, typename Sum, typename Q8>
|
||||
inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
|
||||
auto qx = unp.quants();
|
||||
__m256 dall[nrc_y];
|
||||
for (int i = 0; i < nb/4; ++i) {
|
||||
auto other_scales = unp.set_block_4(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
|
||||
dall[iy] = accm.compute(s12, iy);
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto pall = sum.compute(qx, y[iy] + 4*i);
|
||||
acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
|
||||
}
|
||||
}
|
||||
if (!is_multiple_of_4) {
|
||||
for (int i = 4*(nb/4); i < nb; ++i) {
|
||||
auto other_scales = unp.set_block(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare1(other_scales, y[iy] + i);
|
||||
auto d = accm.compute(s12, iy);
|
||||
const __m256i p0 = sum.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
info.store(ix, iy, accm.result(acc[iy], iy));
|
||||
}
|
||||
}
|
||||
template <typename Unpacker, typename Scales, typename Sum, typename Q8>
|
||||
inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, __m256 * result) {
|
||||
auto qx = unp.quants();
|
||||
__m256 dall[nrc_y];
|
||||
for (int i = 0; i < nb/4; ++i) {
|
||||
auto other_scales = unp.set_block_4(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
|
||||
dall[iy] = accm.compute(s12, iy);
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto pall = sum.compute(qx, y[iy] + 4*i);
|
||||
acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
|
||||
}
|
||||
}
|
||||
if (!is_multiple_of_4) {
|
||||
for (int i = 4*(nb/4); i < nb; ++i) {
|
||||
auto other_scales = unp.set_block(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare1(other_scales, y[iy] + i);
|
||||
auto d = accm.compute(s12, iy);
|
||||
const __m256i p0 = sum.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
result[iy] = accm.vresult(acc[iy], iy);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int nrc_y, bool is_multiple_of_4>
|
||||
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
|
||||
|
||||
template <int nrc_y, bool is_multiple_of_4>
|
||||
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;
|
||||
|
||||
using Sum4TypeQ80 = Sum4<block_q8_0, block_q8_0_x4, SignedDot, false>;
|
||||
using Sum4TypeQ82 = Sum4<block_q8_2, block_q8_2_x4, UnsignedDot, false>;
|
||||
|
||||
template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
|
||||
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
Scales scales;
|
||||
for (int ix = 0; ix < nrc_x; ++ix) {
|
||||
unp.set_row(ix);
|
||||
AccumType accum;
|
||||
accum.compute(nb, unp, scales, sum4, y, info, ix);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
|
||||
void mul_mat_qX_q8_Helper_x2(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%2 == 0);
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
Scales scales;
|
||||
for (int ix = 0; ix < nrc_x; ix += 2) {
|
||||
unp.set_row(ix);
|
||||
AccumType accum;
|
||||
accum.compute(nb, unp, scales, sum4, y, info, ix);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_0> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, true>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, false>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y, int nrc_x>
|
||||
void mul_mat_qX_0_q8_0_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) {
|
||||
static_assert(8%nrc_y == 0);
|
||||
Q8<nrc_y, block_q8_0> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
ScaleHelperQ8_0 scales;
|
||||
__m256 result[8];
|
||||
auto store = [&info, &result] (int ix0) {
|
||||
if constexpr (nrc_y == 1) {
|
||||
info.store(ix0, 0, hsum_float_8x8(result));
|
||||
}
|
||||
else if constexpr (nrc_y == 2) {
|
||||
auto value = hsum_float_8x8(result);
|
||||
auto value1 = _mm256_extractf128_ps(value, 1);
|
||||
info.store(ix0, 0, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0x88));
|
||||
info.store(ix0, 1, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0xdd));
|
||||
}
|
||||
else {
|
||||
float val[8];
|
||||
_mm256_storeu_ps(val, hsum_float_8x8(result));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) for (int ix = 0; ix < 8/nrc_y; ++ix) info.store(ix0+ix, iy, val[nrc_y*ix+iy]);
|
||||
}
|
||||
};
|
||||
if (nb%4 == 0) {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType0<nrc_y, true> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
} else {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType0<nrc_y, false> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_1> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_1_q8_2_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_2> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_2, block_q8_2, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_2, block_q8_2, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y, int nrc_x>
|
||||
void mul_mat_qX_0_q8_2_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) {
|
||||
static_assert(8%nrc_y == 0);
|
||||
Q8<nrc_y, block_q8_2> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
ScaleHelperQ8_2 scales;
|
||||
__m256 result[8];
|
||||
auto store = [&info, &result] (int ix0) {
|
||||
if constexpr (nrc_y == 1) {
|
||||
info.store(ix0, 0, hsum_float_8x8(result));
|
||||
}
|
||||
else if constexpr (nrc_y == 2) {
|
||||
auto value = hsum_float_8x8(result);
|
||||
auto value1 = _mm256_extractf128_ps(value, 1);
|
||||
info.store(ix0, 0, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0x88));
|
||||
info.store(ix0, 1, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0xdd));
|
||||
}
|
||||
else {
|
||||
float val[8];
|
||||
_mm256_storeu_ps(val, hsum_float_8x8(result));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) for (int ix = 0; ix < 8/nrc_y; ++ix) info.store(ix0+ix, iy, val[nrc_y*ix+iy]);
|
||||
}
|
||||
};
|
||||
if (nb%4 == 0) {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType1<nrc_y, true> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
} else {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType1<nrc_y, false> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Dequantizer4bit {
|
||||
const __m256i m4 = _mm256_set1_epi8(0xf);
|
||||
inline __m256i dequant(const uint8_t * qs) const {
|
||||
const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
|
||||
return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_Dequantizer {
|
||||
inline __m256i dequant(const block_q8_0 * x) const {
|
||||
return _mm256_loadu_si256((const __m256i *)x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_1_Dequantizer {
|
||||
inline __m256i dequant(const block_q8_0 * x) const {
|
||||
return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_0_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
const __m256i m8 = _mm256_set1_epi8(-8);
|
||||
inline __m256i dequant(const block_q4_0 * x) const {
|
||||
return _mm256_add_epi8(b4.dequant(x->qs), m8);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_0_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
inline __m256i dequant(const block_q4_0 * x) const {
|
||||
return b4.dequant(x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct IQ4_NL_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
const __m256i values = load_iq4nl_values_256();
|
||||
#else
|
||||
const __m256i values = load_iq4k_values_256();
|
||||
#endif
|
||||
inline __m256i dequant(const block_iq4_nl * x) const {
|
||||
return _mm256_shuffle_epi8(values, b4.dequant(x->qs));
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
inline __m256i dequant(const block_q4_1 * x) const {
|
||||
return b4.dequant(x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct HBitDequantizer {
|
||||
const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
||||
const __m256i minus1 = _mm256_set1_epi64x(-1);
|
||||
inline __m256i to_bytes(const uint8_t * bits) const {
|
||||
// Note: Data in all ggml quants is at least 2-byte aligned.
|
||||
// => we can cast to uint16_t and use or on two consecutive entries
|
||||
// which is faster than memcpy
|
||||
const uint16_t * aux16 = (const uint16_t *)bits;
|
||||
const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
|
||||
//uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
|
||||
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
|
||||
bytes = _mm256_or_si256(bytes, mask);
|
||||
return _mm256_cmpeq_epi8(bytes, minus1);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q5_0_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
HBitDequantizer hbit;
|
||||
const __m256i mh = _mm256_set1_epi8((char)0xF0);
|
||||
inline __m256i dequant(const block_q5_0 * x) const {
|
||||
const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), vqh);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q5>
|
||||
struct Q5_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
HBitDequantizer hbit;
|
||||
const __m256i mh = _mm256_set1_epi8(0x10);
|
||||
inline __m256i dequant(const Q5 * x) const {
|
||||
const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), vqh);
|
||||
}
|
||||
};
|
||||
struct Q6_0_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
const __m256i mh = _mm256_set1_epi8(0x30);
|
||||
const __m256i shift1 = _mm256_set_epi64x(0, 2, 0, 4);
|
||||
const __m256i shift2 = _mm256_set_epi64x(2, 0, 0, 0);
|
||||
inline __m256i dequant(const block_q6_0 * x) const {
|
||||
uint64_t aux64; std::memcpy(&aux64, x->qh, 8);
|
||||
auto h256 = _mm256_sllv_epi64(_mm256_set1_epi64x(aux64), shift1);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), _mm256_and_si256(_mm256_srlv_epi64(h256, shift2), mh));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q, typename Scales, typename Dequantizer>
|
||||
struct Q_Unpacker {
|
||||
Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}
|
||||
|
||||
const char * cx_0;
|
||||
const Q * x;
|
||||
size_t bx;
|
||||
|
||||
Scales scales;
|
||||
Dequantizer deq;
|
||||
|
||||
__m256i qx[4];
|
||||
|
||||
inline const __m256i* quants() const { return qx; }
|
||||
|
||||
inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }
|
||||
|
||||
inline auto set_block_4(int i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
qx[j] = deq.dequant(x + 4*i + j);
|
||||
}
|
||||
return scales.prepare4(x + 4*i);
|
||||
}
|
||||
inline auto set_block(int i) {
|
||||
qx[0] = deq.dequant(x + i);
|
||||
return scales.prepare1(x + i);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
|
||||
Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK8_0; }
|
||||
};
|
||||
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
|
||||
Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK8_0; }
|
||||
};
|
||||
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
|
||||
Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK4_0; }
|
||||
};
|
||||
struct Q4_0_1_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0_1<8>, Q4_0_1_Dequantizer> {
|
||||
Q4_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
//using Sum4T = Sum4TypeQ82;
|
||||
using Sum4T = Sum4q4<block_q8_2, block_q8_2_x4>;
|
||||
inline static int block_size() { return QK4_0; }
|
||||
};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#else
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#endif
|
||||
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
|
||||
Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK5_0; }
|
||||
};
|
||||
struct Q5_0_1_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0_1<16>, Q5_1_Dequantizer<block_q5_0>> {
|
||||
Q5_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK5_0; }
|
||||
};
|
||||
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
|
||||
Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_1; }
|
||||
};
|
||||
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer<block_q5_1>> {
|
||||
Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK5_1; }
|
||||
};
|
||||
struct Q6_0_1_Unpacker final : public Q_Unpacker<block_q6_0, ScaleHelperQ_0_1<32>, Q6_0_1_Dequantizer> {
|
||||
Q6_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK6_0; }
|
||||
};
|
||||
|
||||
template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX_NY>& funcs) {
|
||||
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
|
||||
funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
|
||||
funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
|
||||
funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
|
||||
funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
|
||||
funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
|
||||
funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
|
||||
funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
|
||||
funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
#else
|
||||
funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
|
||||
funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
|
||||
funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
|
||||
funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
|
||||
funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
|
||||
funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
|
||||
funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
|
||||
#endif
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker>) {
|
||||
funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels) {
|
||||
|
||||
if (ne00%QK8_0 != 0) return false;
|
||||
|
||||
auto expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
|
||||
switch (typeA) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
set_functions<Q4_0_1_Unpacker>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
set_functions<Q4_1_Unpacker>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
set_functions<Q5_0_1_Unpacker>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
set_functions<Q5_1_Unpacker>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_Q6_0:
|
||||
set_functions<Q6_0_1_Unpacker>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
set_functions<Q8_0_1_Unpacker>(kernels);
|
||||
#else
|
||||
set_functions<Q8_0_Unpacker>(kernels);
|
||||
expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
#endif
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
set_functions<IQ4_NL_Unpacker>(kernels);
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
return ggml_type(typeB) == expected_typeB;
|
||||
}
|
||||
|
||||
#endif
|
||||
11
ggml/src/iqk/iqk_gemm_legacy_quants.h
Normal file
11
ggml/src/iqk/iqk_gemm_legacy_quants.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "iqk_common.h"
|
||||
|
||||
#ifdef IQK_IMPLEMENT
|
||||
|
||||
#include <array>
|
||||
|
||||
bool iqk_set_kernels_legacy_quants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels);
|
||||
|
||||
#endif
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "iqk_quantize.h"
|
||||
#include "iqk_flash_impl.h"
|
||||
#include "iqk_gemm_floats.h"
|
||||
#include "iqk_gemm_legacy_quants.h"
|
||||
|
||||
#define GGML_COMMON_IMPL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -1313,72 +1314,6 @@ const uint64_t keven_signs[128] = {
|
||||
|
||||
namespace {
|
||||
|
||||
inline float hsum_float_4(__m128 x) {
|
||||
x = _mm_add_ps(x, _mm_movehl_ps(x, x));
|
||||
x = _mm_add_ss(x, _mm_movehdup_ps(x));
|
||||
return _mm_cvtss_f32(x);
|
||||
}
|
||||
inline float hsum_float_8(__m256 x) {
|
||||
return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
|
||||
}
|
||||
inline int hsum_i32_8(const __m256i a) {
|
||||
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
|
||||
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
|
||||
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
inline float hmax_float_8(__m256 x) {
|
||||
__m128 max4 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4));
|
||||
return _mm_cvtss_f32(max4);
|
||||
}
|
||||
|
||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||
|
||||
template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
|
||||
|
||||
constexpr static int nrc_y = nrc;
|
||||
|
||||
Q8(const DataInfo& info) {
|
||||
for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
|
||||
#endif
|
||||
inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
|
||||
inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
|
||||
inline float scale(int iy, int i) const { return y[iy][i].d; }
|
||||
|
||||
const block_q8 * y[nrc_y];
|
||||
};
|
||||
|
||||
template <int nrc> struct Q8_16 {
|
||||
|
||||
constexpr static int nrc_y = nrc;
|
||||
|
||||
Q8_16(const DataInfo& info) {
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto ptr = (const float *)info.src1_row(iy);
|
||||
std::memcpy(d + 5*iy, ptr, 5*sizeof(float));
|
||||
y[iy] = (const int8_t *)(ptr + 5);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
inline __m512i load_quants64(int iy, int i) const { return _mm512_loadu_si512((const __m512i*)y[iy] + i); }
|
||||
#endif
|
||||
inline __m256i load_quants(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy] + i); }
|
||||
inline float scale(int iy, int k) const { return d[5*iy+k]; }
|
||||
inline float sum_row(int iy) const { return d[5*iy + 4]; }
|
||||
inline __m128 scale(int iy) const { return _mm_loadu_ps(d + 5*iy); }
|
||||
|
||||
float d[5*nrc_y];
|
||||
const int8_t * y[nrc_y];
|
||||
};
|
||||
|
||||
struct Scales8KBase {
|
||||
template <typename Q8>
|
||||
inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
|
||||
@@ -1645,25 +1580,6 @@ struct SimpleBits {
|
||||
__m256i values[4];
|
||||
};
|
||||
|
||||
__m128i inline load_iq4nl_values_128() {
|
||||
static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
|
||||
return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
|
||||
}
|
||||
|
||||
__m256i inline load_iq4nl_values_256() {
|
||||
auto val128 = load_iq4nl_values_128();
|
||||
return MM256_SET_M128I(val128, val128);
|
||||
}
|
||||
|
||||
__m128i inline load_iq4k_values_128() {
|
||||
return _mm_loadu_si128((const __m128i *)iq4k_values);
|
||||
}
|
||||
|
||||
__m256i inline load_iq4k_values_256() {
|
||||
auto val128 = load_iq4k_values_128();
|
||||
return MM256_SET_M128I(val128, val128);
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
//====================================== Zen4 ==================================================
|
||||
|
||||
@@ -8462,750 +8378,10 @@ struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
|
||||
const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
|
||||
};
|
||||
|
||||
//
|
||||
// ============================== Legacy quants
|
||||
//
|
||||
|
||||
struct DotHelper {
|
||||
const __m256i m1 = _mm256_set1_epi16(1);
|
||||
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
|
||||
inline __m256i dot(__m256i x, __m256i y) const {
|
||||
return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
|
||||
}
|
||||
#else
|
||||
inline __m256i dot(__m256i x, __m256i y) const {
|
||||
return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
struct SignedDot {
|
||||
DotHelper helper;
|
||||
inline __m256i compute(__m256i x, __m256i y) const {
|
||||
return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
|
||||
}
|
||||
};
|
||||
struct UnsignedDot {
|
||||
DotHelper helper;
|
||||
inline __m256i compute(__m256i x, __m256i y) const {
|
||||
return helper.dot(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q8, typename Q8x4, typename Dot, bool can_pack = true> struct Sum4 {
|
||||
Dot dot;
|
||||
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
|
||||
const Q8x4 * y4 = (const Q8x4 *)y;
|
||||
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0)); // 8x block 0
|
||||
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1)); // 8x block 1
|
||||
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2)); // 8x block 2
|
||||
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3)); // 8x block 3
|
||||
if constexpr (can_pack) {
|
||||
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
|
||||
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
|
||||
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
} else {
|
||||
// Note to myself: this is much faster than using _mm256_hadd_epi32()
|
||||
auto p01 = _mm256_add_epi32(_mm256_unpacklo_epi32(p0, p1), _mm256_unpackhi_epi32(p0, p1)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto p23 = _mm256_add_epi32(_mm256_unpacklo_epi32(p2, p3), _mm256_unpackhi_epi32(p2, p3)); // 2,3, 2,3, 2,3, 2,3
|
||||
return _mm256_add_epi32(_mm256_unpacklo_epi64(p01, p23), _mm256_unpackhi_epi64(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
}
|
||||
}
|
||||
inline __m256i compute(__m256i x, __m256i y) const { return dot.compute(x, y); }
|
||||
};
|
||||
|
||||
template <typename Q8, typename Q8x4> struct Sum4q4 {
|
||||
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
|
||||
const Q8x4 * y4 = (const Q8x4 *)y;
|
||||
auto p0 = _mm256_maddubs_epi16(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0)); // 16x block 0
|
||||
auto p1 = _mm256_maddubs_epi16(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1)); // 16x block 1
|
||||
auto p2 = _mm256_maddubs_epi16(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2)); // 16x block 2
|
||||
auto p3 = _mm256_maddubs_epi16(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3)); // 16x block 3
|
||||
auto p01 = _mm256_add_epi16(_mm256_unpacklo_epi32(p0, p1), _mm256_unpackhi_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1, 0,0, 1,1, 0,0, 1,1
|
||||
auto p23 = _mm256_add_epi16(_mm256_unpacklo_epi32(p2, p3), _mm256_unpackhi_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3, 2,2, 3,3, 2,2, 3,3
|
||||
auto p0123 = _mm256_add_epi16(_mm256_unpacklo_epi64(p01, p23), _mm256_unpackhi_epi64(p01, p23)); // 0,0, 1,1, 2,2, 3,3, 0,0, 1,1, 2,2, 3,3
|
||||
return _mm256_madd_epi16(_mm256_set1_epi16(1), p0123);
|
||||
}
|
||||
inline __m256i compute(__m256i x, __m256i y) const { return _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(x, y)); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ8_0 {
|
||||
inline __m128 prepare4(const block_q8_0 * y) {
|
||||
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
|
||||
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)y4->d));
|
||||
}
|
||||
inline __m128 prepare4(__m128 other_scales, const block_q8_0 * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_0 {
|
||||
ggml_half scales8[4];
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m128 prepare4(__m128 other_scales, const Q * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
template <int min_value>
|
||||
struct ScaleHelperQ_0_1 {
|
||||
ggml_half scales8[4];
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
|
||||
return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm_mul256_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
float d = GGML_FP16_TO_FP32(y->d);
|
||||
return std::make_pair(d, -d*float(min_value));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
const __m128 min = _mm_set1_ps(float(-min_value));
|
||||
};
|
||||
|
||||
//template <int min_value>
|
||||
//struct ScaleHelperQ_0_2 {
|
||||
// ggml_bf16_t scales8[4];
|
||||
// template <typename Q>
|
||||
// inline __m256 prepare4(const Q * y) {
|
||||
// for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
|
||||
// auto s4 = _mm_castsi128_ps(_mm_slli_epi16(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)scales8)), 16));
|
||||
// return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
|
||||
// }
|
||||
// template <typename Q>
|
||||
// inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
// return _mm_mul256_ps(other_scales, prepare4<Q>(y));
|
||||
// }
|
||||
// template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
// float d = GGML_BF16_TO_FP32(y->d);
|
||||
// return std::make_pair(d, -d*float(min_value));
|
||||
// }
|
||||
// std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
// return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
// }
|
||||
// const __m128 min = _mm_set1_ps(float(-min_value));
|
||||
//};
|
||||
|
||||
struct ScaleHelperQ8_1 {
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y;
|
||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)y4->d));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
};
|
||||
|
||||
struct ScaleHelperQ8_2 {
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
const block_q8_2_x4 * y4 = (const block_q8_2_x4 *)y;
|
||||
auto aux = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)y4->d));
|
||||
return _mm256_castsi256_ps(_mm256_slli_epi32(aux, 16));
|
||||
}
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_BF16_TO_FP32(y->d), GGML_BF16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s;
|
||||
return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_2 * y) const {
|
||||
ggml_bf16_t d, s; d.bits = y->d; s.bits = y->s;
|
||||
return std::make_pair(dm.first*GGML_BF16_TO_FP32(d), dm.second*GGML_BF16_TO_FP32(s));
|
||||
}
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_1 {
|
||||
uint32_t scales8[4];
|
||||
const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
|
||||
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(const Q * y) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
// it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers
|
||||
// complain that this breaks strict-aliasing rules.
|
||||
memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
|
||||
}
|
||||
return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
|
||||
}
|
||||
|
||||
template <typename Q>
|
||||
inline __m256 prepare4(__m256 other_scales, const Q * y) {
|
||||
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
|
||||
}
|
||||
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
|
||||
return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
|
||||
}
|
||||
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
|
||||
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
|
||||
}
|
||||
};
|
||||
|
||||
struct MinusType0 {
|
||||
inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
|
||||
inline float compute(float d, int) const { return d; }
|
||||
inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
|
||||
inline __m256 vresult(__m256 acc, int) const { return acc; }
|
||||
};
|
||||
|
||||
template <int nrc_y> struct MinusType1 {
|
||||
__m128 accm[nrc_y];
|
||||
MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
|
||||
inline __m256 compute(__m256 dm, int iy) {
|
||||
const __m128 d = _mm256_castps256_ps128(dm);
|
||||
const __m128 m = _mm256_extractf128_ps(dm, 1);
|
||||
accm[iy] = _mm_add_ps(accm[iy], m);
|
||||
return _mm256_set_m128(d, d);
|
||||
}
|
||||
inline float compute(const std::pair<float, float>& dm, int iy) {
|
||||
accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
|
||||
return dm.first;
|
||||
}
|
||||
inline float result(__m256 acc, int iy) const {
|
||||
const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
|
||||
return hsum_float_4(_mm_add_ps(sum, accm[iy]));
|
||||
}
|
||||
inline __m256 vresult(__m256 acc, int iy) const {
|
||||
return _mm256_add_ps(acc, _mm256_insertf128_ps(_mm256_setzero_ps(), accm[iy], 0));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
|
||||
__m256 acc[nrc_y];
|
||||
Minus accm;
|
||||
AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
|
||||
template <typename Unpacker, typename Scales, typename Sum, typename Q8>
|
||||
inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
|
||||
auto qx = unp.quants();
|
||||
__m256 dall[nrc_y];
|
||||
for (int i = 0; i < nb/4; ++i) {
|
||||
auto other_scales = unp.set_block_4(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
|
||||
dall[iy] = accm.compute(s12, iy);
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto pall = sum.compute(qx, y[iy] + 4*i);
|
||||
acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
|
||||
}
|
||||
}
|
||||
if (!is_multiple_of_4) {
|
||||
for (int i = 4*(nb/4); i < nb; ++i) {
|
||||
auto other_scales = unp.set_block(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare1(other_scales, y[iy] + i);
|
||||
auto d = accm.compute(s12, iy);
|
||||
const __m256i p0 = sum.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
info.store(ix, iy, accm.result(acc[iy], iy));
|
||||
}
|
||||
}
|
||||
template <typename Unpacker, typename Scales, typename Sum, typename Q8>
|
||||
inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, __m256 * result) {
|
||||
auto qx = unp.quants();
|
||||
__m256 dall[nrc_y];
|
||||
for (int i = 0; i < nb/4; ++i) {
|
||||
auto other_scales = unp.set_block_4(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
|
||||
dall[iy] = accm.compute(s12, iy);
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto pall = sum.compute(qx, y[iy] + 4*i);
|
||||
acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
|
||||
}
|
||||
}
|
||||
if (!is_multiple_of_4) {
|
||||
for (int i = 4*(nb/4); i < nb; ++i) {
|
||||
auto other_scales = unp.set_block(i);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto s12 = scales.prepare1(other_scales, y[iy] + i);
|
||||
auto d = accm.compute(s12, iy);
|
||||
const __m256i p0 = sum.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
result[iy] = accm.vresult(acc[iy], iy);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <int nrc_y, bool is_multiple_of_4>
|
||||
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
|
||||
|
||||
template <int nrc_y, bool is_multiple_of_4>
|
||||
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;
|
||||
|
||||
using Sum4TypeQ80 = Sum4<block_q8_0, block_q8_0_x4, SignedDot, false>;
|
||||
using Sum4TypeQ82 = Sum4<block_q8_2, block_q8_2_x4, UnsignedDot, false>;
|
||||
|
||||
template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
|
||||
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
Scales scales;
|
||||
for (int ix = 0; ix < nrc_x; ++ix) {
|
||||
unp.set_row(ix);
|
||||
AccumType accum;
|
||||
accum.compute(nb, unp, scales, sum4, y, info, ix);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, typename AccumType, typename Scales, typename Q8, int nrc_y>
|
||||
void mul_mat_qX_q8_Helper_x2(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%2 == 0);
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
Scales scales;
|
||||
for (int ix = 0; ix < nrc_x; ix += 2) {
|
||||
unp.set_row(ix);
|
||||
AccumType accum;
|
||||
accum.compute(nb, unp, scales, sum4, y, info, ix);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_0> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, true>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType0<nrc_y, false>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
inline __m256 hsum_float_8x8(__m256 * accm) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
accm[i] = _mm256_add_ps(_mm256_permute2f128_ps(accm[i], accm[i+4], 0x20), _mm256_permute2f128_ps(accm[i], accm[i+4], 0x31));
|
||||
//accm[i] = _mm256_set_m128(_mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1)),
|
||||
// _mm_add_ps(_mm256_castps256_ps128(accm[i+0]), _mm256_extractf128_ps(accm[i+0], 1)));
|
||||
}
|
||||
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
|
||||
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y, int nrc_x>
|
||||
void mul_mat_qX_0_q8_0_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) {
|
||||
static_assert(8%nrc_y == 0);
|
||||
Q8<nrc_y, block_q8_0> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
ScaleHelperQ8_0 scales;
|
||||
__m256 result[8];
|
||||
auto store = [&info, &result] (int ix0) {
|
||||
if constexpr (nrc_y == 1) {
|
||||
info.store(ix0, 0, hsum_float_8x8(result));
|
||||
}
|
||||
else if constexpr (nrc_y == 2) {
|
||||
auto value = hsum_float_8x8(result);
|
||||
auto value1 = _mm256_extractf128_ps(value, 1);
|
||||
info.store(ix0, 0, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0x88));
|
||||
info.store(ix0, 1, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0xdd));
|
||||
}
|
||||
else {
|
||||
float val[8];
|
||||
_mm256_storeu_ps(val, hsum_float_8x8(result));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) for (int ix = 0; ix < 8/nrc_y; ++ix) info.store(ix0+ix, iy, val[nrc_y*ix+iy]);
|
||||
}
|
||||
};
|
||||
if (nb%4 == 0) {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType0<nrc_y, true> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
} else {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType0<nrc_y, false> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_1> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_1, block_q8_1, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y>
|
||||
void mul_mat_qX_1_q8_2_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
assert(n%Unpacker::block_size() == 0);
|
||||
Q8<nrc_y, block_q8_2> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, true>, ScaleHelperQ8_2, block_q8_2, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, AccumType1<nrc_y, false>, ScaleHelperQ8_2, block_q8_2, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Unpacker, int nrc_y, int nrc_x>
|
||||
void mul_mat_qX_0_q8_2_Tx(int n, const void * vx, size_t bx, const DataInfo& info, int) {
|
||||
static_assert(8%nrc_y == 0);
|
||||
Q8<nrc_y, block_q8_2> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
Unpacker unp(vx, bx);
|
||||
typename Unpacker::Sum4T sum4;
|
||||
ScaleHelperQ8_2 scales;
|
||||
__m256 result[8];
|
||||
auto store = [&info, &result] (int ix0) {
|
||||
if constexpr (nrc_y == 1) {
|
||||
info.store(ix0, 0, hsum_float_8x8(result));
|
||||
}
|
||||
else if constexpr (nrc_y == 2) {
|
||||
auto value = hsum_float_8x8(result);
|
||||
auto value1 = _mm256_extractf128_ps(value, 1);
|
||||
info.store(ix0, 0, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0x88));
|
||||
info.store(ix0, 1, _mm_shuffle_ps(_mm256_castps256_ps128(value), value1, 0xdd));
|
||||
}
|
||||
else {
|
||||
float val[8];
|
||||
_mm256_storeu_ps(val, hsum_float_8x8(result));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) for (int ix = 0; ix < 8/nrc_y; ++ix) info.store(ix0+ix, iy, val[nrc_y*ix+iy]);
|
||||
}
|
||||
};
|
||||
if (nb%4 == 0) {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType1<nrc_y, true> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
} else {
|
||||
for (int ix0 = 0; ix0 < nrc_x; ix0 += 8/nrc_y) {
|
||||
for (int ix = 0; ix < 8/nrc_y; ++ix) {
|
||||
unp.set_row(ix0 + ix);
|
||||
AccumType1<nrc_y, false> accum;
|
||||
accum.compute(nb, unp, scales, sum4, q8.y, result + nrc_y*ix);
|
||||
}
|
||||
store(ix0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Dequantizer4bit {
|
||||
const __m256i m4 = _mm256_set1_epi8(0xf);
|
||||
inline __m256i dequant(const uint8_t * qs) const {
|
||||
const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
|
||||
return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_Dequantizer {
|
||||
inline __m256i dequant(const block_q8_0 * x) const {
|
||||
return _mm256_loadu_si256((const __m256i *)x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_1_Dequantizer {
|
||||
inline __m256i dequant(const block_q8_0 * x) const {
|
||||
return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_0_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
const __m256i m8 = _mm256_set1_epi8(-8);
|
||||
inline __m256i dequant(const block_q4_0 * x) const {
|
||||
return _mm256_add_epi8(b4.dequant(x->qs), m8);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_0_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
inline __m256i dequant(const block_q4_0 * x) const {
|
||||
return b4.dequant(x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct IQ4_NL_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
const __m256i values = load_iq4nl_values_256();
|
||||
#else
|
||||
const __m256i values = load_iq4k_values_256();
|
||||
#endif
|
||||
inline __m256i dequant(const block_iq4_nl * x) const {
|
||||
return _mm256_shuffle_epi8(values, b4.dequant(x->qs));
|
||||
}
|
||||
};
|
||||
|
||||
struct Q4_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
inline __m256i dequant(const block_q4_1 * x) const {
|
||||
return b4.dequant(x->qs);
|
||||
}
|
||||
};
|
||||
|
||||
struct HBitDequantizer {
|
||||
const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
||||
const __m256i minus1 = _mm256_set1_epi64x(-1);
|
||||
inline __m256i to_bytes(const uint8_t * bits) const {
|
||||
// Note: Data in all ggml quants is at least 2-byte aligned.
|
||||
// => we can cast to uint16_t and use or on two consecutive entries
|
||||
// which is faster than memcpy
|
||||
const uint16_t * aux16 = (const uint16_t *)bits;
|
||||
const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
|
||||
//uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
|
||||
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
|
||||
bytes = _mm256_or_si256(bytes, mask);
|
||||
return _mm256_cmpeq_epi8(bytes, minus1);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q5_0_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
HBitDequantizer hbit;
|
||||
const __m256i mh = _mm256_set1_epi8((char)0xF0);
|
||||
inline __m256i dequant(const block_q5_0 * x) const {
|
||||
const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), vqh);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q5>
|
||||
struct Q5_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
HBitDequantizer hbit;
|
||||
const __m256i mh = _mm256_set1_epi8(0x10);
|
||||
inline __m256i dequant(const Q5 * x) const {
|
||||
const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), vqh);
|
||||
}
|
||||
};
|
||||
struct Q6_0_1_Dequantizer {
|
||||
Dequantizer4bit b4;
|
||||
const __m256i mh = _mm256_set1_epi8(0x30);
|
||||
const __m256i shift1 = _mm256_set_epi64x(0, 2, 0, 4);
|
||||
const __m256i shift2 = _mm256_set_epi64x(2, 0, 0, 0);
|
||||
inline __m256i dequant(const block_q6_0 * x) const {
|
||||
uint64_t aux64; std::memcpy(&aux64, x->qh, 8);
|
||||
auto h256 = _mm256_sllv_epi64(_mm256_set1_epi64x(aux64), shift1);
|
||||
return _mm256_or_si256(b4.dequant(x->qs), _mm256_and_si256(_mm256_srlv_epi64(h256, shift2), mh));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Q, typename Scales, typename Dequantizer>
|
||||
struct Q_Unpacker {
|
||||
Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}
|
||||
|
||||
const char * cx_0;
|
||||
const Q * x;
|
||||
size_t bx;
|
||||
|
||||
Scales scales;
|
||||
Dequantizer deq;
|
||||
|
||||
__m256i qx[4];
|
||||
|
||||
inline const __m256i* quants() const { return qx; }
|
||||
|
||||
inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }
|
||||
|
||||
inline auto set_block_4(int i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
qx[j] = deq.dequant(x + 4*i + j);
|
||||
}
|
||||
return scales.prepare4(x + 4*i);
|
||||
}
|
||||
inline auto set_block(int i) {
|
||||
qx[0] = deq.dequant(x + i);
|
||||
return scales.prepare1(x + i);
|
||||
}
|
||||
};
|
||||
|
||||
struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
|
||||
Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK8_0; }
|
||||
};
|
||||
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
|
||||
Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK8_0; }
|
||||
};
|
||||
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
|
||||
Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK4_0; }
|
||||
};
|
||||
struct Q4_0_1_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0_1<8>, Q4_0_1_Dequantizer> {
|
||||
Q4_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
//using Sum4T = Sum4TypeQ82;
|
||||
using Sum4T = Sum4q4<block_q8_2, block_q8_2_x4>;
|
||||
inline static int block_size() { return QK4_0; }
|
||||
};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0_1<128>, IQ4_NL_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#else
|
||||
struct IQ4_NL_Unpacker final : public Q_Unpacker<block_iq4_nl, ScaleHelperQ_0, IQ4_NL_Dequantizer> {
|
||||
IQ4_NL_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK4_NL; }
|
||||
};
|
||||
#endif
|
||||
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
|
||||
Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ80;
|
||||
inline static int block_size() { return QK5_0; }
|
||||
};
|
||||
struct Q5_0_1_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0_1<16>, Q5_1_Dequantizer<block_q5_0>> {
|
||||
Q5_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK5_0; }
|
||||
};
|
||||
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
|
||||
Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK4_1; }
|
||||
};
|
||||
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer<block_q5_1>> {
|
||||
Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK5_1; }
|
||||
};
|
||||
struct Q6_0_1_Unpacker final : public Q_Unpacker<block_q6_0, ScaleHelperQ_0_1<32>, Q6_0_1_Dequantizer> {
|
||||
Q6_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
|
||||
using Sum4T = Sum4TypeQ82;
|
||||
inline static int block_size() { return QK6_0; }
|
||||
};
|
||||
|
||||
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
|
||||
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
|
||||
m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
|
||||
m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
|
||||
m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
|
||||
m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
|
||||
m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
|
||||
m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
|
||||
m.funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
m.funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
m.funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
m.funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
m.funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
m.funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, IQ4_NL_Unpacker>) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
m.funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
m.funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
m.funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
m.funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
m.funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
m.funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
#else
|
||||
m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
|
||||
m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
|
||||
m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
|
||||
m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
|
||||
m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
|
||||
m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
|
||||
#endif
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, Q8_0_1_Unpacker> || std::is_same_v<Dequantizer, Q4_0_1_Unpacker> ||
|
||||
std::is_same_v<Dequantizer, Q5_0_1_Unpacker> || std::is_same_v<Dequantizer, Q6_0_1_Unpacker>) {
|
||||
m.funcs[0] = mul_mat_qX_1_q8_2_T<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_1_q8_2_T<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_1_q8_2_T<Dequantizer, 3>;
|
||||
m.funcs[3] = mul_mat_qX_1_q8_2_T<Dequantizer, 4>;
|
||||
m.funcs[4] = mul_mat_qX_1_q8_2_T<Dequantizer, 5>;
|
||||
m.funcs[5] = mul_mat_qX_1_q8_2_T<Dequantizer, 6>;
|
||||
m.funcs[6] = mul_mat_qX_1_q8_2_T<Dequantizer, 7>;
|
||||
m.funcs[7] = mul_mat_qX_1_q8_2_T<Dequantizer, 8>;
|
||||
}
|
||||
else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ3S> || std::is_same_v<Dequantizer, DequantizerIQ3XXS> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ2S> || std::is_same_v<Dequantizer, DequantizerIQ2XS> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
|
||||
if constexpr (std::is_same_v<Dequantizer, DequantizerIQ3S> || std::is_same_v<Dequantizer, DequantizerIQ3XXS> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ2S> || std::is_same_v<Dequantizer, DequantizerIQ2XS> ||
|
||||
std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
|
||||
m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
|
||||
m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
|
||||
m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
|
||||
@@ -9415,49 +8591,13 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
expected_typeB = GGML_TYPE_Q8_K16;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
assert (ne00 % QK4_0 == 0);
|
||||
MulMat::set_functions<Q4_0_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
assert (ne00 % QK4_1 == 0);
|
||||
MulMat::set_functions<Q4_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
assert (ne00 % QK5_0 == 0);
|
||||
MulMat::set_functions<Q5_0_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
assert (ne00 % QK5_1 == 0);
|
||||
MulMat::set_functions<Q5_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
break;
|
||||
case GGML_TYPE_Q6_0:
|
||||
assert (ne00 % QK6_0 == 0);
|
||||
MulMat::set_functions<Q6_0_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
assert (ne00 % QK8_0 == 0);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
MulMat::set_functions<Q8_0_1_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
#else
|
||||
MulMat::set_functions<Q8_0_Unpacker>(mm);
|
||||
expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
#endif
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
assert (ne00 % QK4_NL == 0);
|
||||
MulMat::set_functions<IQ4_NL_Unpacker>(mm);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
expected_typeB = GGML_TYPE_Q8_2_X4;
|
||||
#else
|
||||
expected_typeB = GGML_TYPE_Q8_0_X4;
|
||||
#endif
|
||||
break;
|
||||
return iqk_set_kernels_legacy_quants(ne00, typeA, typeB, mm.funcs);
|
||||
case GGML_TYPE_IQ4_NL_R4:
|
||||
assert (ne00 % QK4_NL == 0);
|
||||
mm.funcs[0] = mul_mat_iq4_nl_r4_q8_2<1>;
|
||||
|
||||
Reference in New Issue
Block a user