mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 23:24:13 +00:00
Apply platform specific modifications when repacking
On Zen4 we can pre-convert the signed quants in q8_0_r4 and q8_k_r8 to unsigned, thus avoiding these operations in matrix multiplications. With this change we hit PP-512 = 382.40 t/s (q8_k_r8) and PP-512 = 306.92 t/s (q8_0_r4) for L3-8B on a Ryzen-7950X using q8_0 KV-cache.
This commit is contained in:
@@ -2988,7 +2988,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
|
||||
int nb = n / QK8_0;
|
||||
GGML_ASSERT(nb%4 == 0);
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto m127 = _mm256_set1_epi8(127);
|
||||
__m256 acc[2] = {};
|
||||
__m256i qx[8];
|
||||
float d8[8];
|
||||
@@ -2998,14 +2997,14 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
|
||||
_mm256_storeu_ps(d8, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)));
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq8[4*ib4+k].d));
|
||||
qx[0] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0), m127);
|
||||
qx[1] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1), m127);
|
||||
qx[2] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2), m127);
|
||||
qx[3] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3), m127);
|
||||
qx[4] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4), m127);
|
||||
qx[5] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5), m127);
|
||||
qx[6] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6), m127);
|
||||
qx[7] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7), m127);
|
||||
qx[0] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0);
|
||||
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1);
|
||||
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2);
|
||||
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3);
|
||||
qx[4] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4);
|
||||
qx[5] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5);
|
||||
qx[6] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6);
|
||||
qx[7] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7);
|
||||
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+0);
|
||||
auto y4h = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+1);
|
||||
auto yl = MM256_SET_M128I(y4l, y4l);
|
||||
@@ -3031,7 +3030,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
|
||||
__m512 acc[2*nrc_y] = {};
|
||||
__m512i qx[8];
|
||||
float d8[8*nrc_y];
|
||||
auto m127 = _mm512_set1_epi8(127);
|
||||
for (int ix = 0; ix < nrc_x; ix += 16) {
|
||||
const block_q8_0_r8 * q8l = (const block_q8_0_r8 *)((const char *)vx + (ix+0)*bx);
|
||||
const block_q8_0_r8 * q8h = (const block_q8_0_r8 *)((const char *)vx + (ix+8)*bx);
|
||||
@@ -3046,7 +3044,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[4*ib4+k].qs+j)),
|
||||
_mm256_loadu_si256((const __m256i *)q8h[4*ib4+k].qs+j), 1);
|
||||
qx[j] = _mm512_add_epi8(qx[j], m127);
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[iy][ib4].qs+2*k+0);
|
||||
@@ -5070,12 +5067,7 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1);
|
||||
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2);
|
||||
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
qx[0] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+0), _mm256_set1_epi8(-128));
|
||||
qx[1] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1), _mm256_set1_epi8(-128));
|
||||
qx[2] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2), _mm256_set1_epi8(-128));
|
||||
qx[3] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3), _mm256_set1_epi8(-128));
|
||||
#else
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
|
||||
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
|
||||
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
|
||||
@@ -13037,6 +13029,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
|
||||
m1 = _mm256_unpackhi_epi64(t0, t1);
|
||||
m2 = _mm256_unpacklo_epi64(t2, t3);
|
||||
m3 = _mm256_unpackhi_epi64(t2, t3);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
|
||||
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
|
||||
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
|
||||
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
|
||||
#endif
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 0, m0);
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 1, m1);
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 2, m2);
|
||||
@@ -13053,6 +13051,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
|
||||
m1 = _mm256_unpackhi_epi64(t0, t1);
|
||||
m2 = _mm256_unpacklo_epi64(t2, t3);
|
||||
m3 = _mm256_unpackhi_epi64(t2, t3);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
|
||||
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
|
||||
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
|
||||
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
|
||||
#endif
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 4, m0);
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 5, m1);
|
||||
_mm256_storeu_si256((__m256i *)y[ib].qs + 6, m2);
|
||||
|
||||
@@ -43,6 +43,15 @@ constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
|
||||
constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
|
||||
#endif
|
||||
|
||||
#if defined __x86_64__
|
||||
#if defined HAVE_FANCY_SIMD
|
||||
#undef HAVE_FANCY_SIMD
|
||||
#endif
|
||||
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
|
||||
#define HAVE_FANCY_SIMD
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
inline int nearest_int(float fval) {
|
||||
@@ -3746,12 +3755,33 @@ static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8
|
||||
y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
if (online) {
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), _mm512_set1_epi8(127));
|
||||
_mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
x += 8*nblock;
|
||||
y += nblock;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
// Pre-convert the signed q8_0_r4 quants stored in cy to unsigned by adding
// 127, so the matrix-multiplication kernels can skip that bias at run time.
//
// k  : total number of quant values in the tensor row group.
// cy : pointer to the repacked blocks; each block holds 8 interleaved rows of
//      32 quants (QK8_0) = 256 bytes of qs, processed as 4 x 64-byte vectors.
//      Cast to block_iq4_nl_r8, which shares the qs layout -- TODO confirm.
static void modify_q8_0_r4(int64_t k, char * cy) {
    auto y = (block_iq4_nl_r8 *)cy;
    // Keep 64-bit arithmetic: a plain `int` would silently truncate
    // k/(32*8) for very large tensors.
    const int64_t nb = k/(32*8);
    const auto bias = _mm512_set1_epi8(127); // hoisted loop-invariant constant
    for (int64_t ib = 0; ib < nb; ++ib) {
        for (int l = 0; l < 4; ++l) {
            auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), bias);
            _mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
        }
    }
}
#endif
|
||||
|
||||
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||
@@ -5195,11 +5225,31 @@ static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8
|
||||
for (int i = 0; i < 4; ++i) y[ibl].qs[32*ib + 4*k + i] = x8[k][ibl].qs[4*ib+i];
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
if (online) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ibl].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[ibl].qs + l, v);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
x += 8*nblock;
|
||||
y += nblock;
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
// Pre-convert the signed q8_k_r8 quants stored in cy to unsigned by XOR-ing
// with 0x80 (equivalent to adding 128 mod 256), so the matrix-multiplication
// kernels can skip that step at run time.
//
// k  : total number of quant values in the tensor row group.
// cy : pointer to the repacked blocks; each block_q8_k_r8 holds 8 interleaved
//      rows of 256 quants (QK_K) = 2048 bytes of qs = 32 x 64-byte vectors.
static void modify_q8_k_r8(int64_t k, char * cy) {
    auto y = (block_q8_k_r8 *)cy;
    // Keep 64-bit arithmetic: a plain `int` would silently truncate
    // k/(256*8) for very large tensors.
    const int64_t nb = k/(256*8);
    const auto signbit = _mm512_set1_epi8(-128); // hoisted loop-invariant constant
    for (int64_t ib = 0; ib < nb; ++ib) {
        for (int l = 0; l < 32; ++l) {
            auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), signbit);
            _mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
        }
    }
}
#endif
|
||||
|
||||
size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
@@ -6057,6 +6107,10 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) {
|
||||
static const std::unordered_map<ggml_type, Modify> k_mod_map = {
|
||||
#ifdef __ARM_NEON
|
||||
{ GGML_TYPE_Q4_0_R4, {modify_q4_0_r4, 8} },
|
||||
#endif
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
{ GGML_TYPE_Q8_0_R4, {modify_q8_0_r4, 8} },
|
||||
{ GGML_TYPE_Q8_K_R8, {modify_q8_k_r8, 8} },
|
||||
#endif
|
||||
};
|
||||
auto it = k_mod_map.find(tensor->type);
|
||||
|
||||
Reference in New Issue
Block a user