Apply platform-specific modifications when repacking

On Zen4 we can pre-convert the signed quants in q8_0_r4 and
q8_k_r8 to unsigned at repack time, thus avoiding the signed-to-unsigned
conversion inside the matrix multiplication kernels. With this change we hit
PP-512 = 382.40 t/s (q8_k_r8)
PP-512 = 306.92 t/s (q8_0_r4)
for L3-8B on a Ryzen-7950X using q8_0 KV-cache.
This commit is contained in:
Iwan Kawrakow
2025-01-27 11:59:30 +02:00
parent 8b3c66063f
commit f1c114d477
2 changed files with 75 additions and 17 deletions

View File

@@ -2988,7 +2988,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
int nb = n / QK8_0;
GGML_ASSERT(nb%4 == 0);
if constexpr (nrc_y == 1) {
auto m127 = _mm256_set1_epi8(127);
__m256 acc[2] = {};
__m256i qx[8];
float d8[8];
@@ -2998,14 +2997,14 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
_mm256_storeu_ps(d8, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)q8.y[0][ib4].d)));
for (int k = 0; k < 4; ++k) {
auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq8[4*ib4+k].d));
qx[0] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0), m127);
qx[1] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1), m127);
qx[2] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2), m127);
qx[3] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3), m127);
qx[4] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4), m127);
qx[5] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5), m127);
qx[6] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6), m127);
qx[7] = _mm256_add_epi8(_mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7), m127);
qx[0] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+0);
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+1);
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+2);
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+3);
qx[4] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+4);
qx[5] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+5);
qx[6] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+6);
qx[7] = _mm256_loadu_si256((const __m256i *)iq8[4*ib4+k].qs+7);
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+0);
auto y4h = _mm_loadu_si128((const __m128i*)q8.y[0][ib4].qs+2*k+1);
auto yl = MM256_SET_M128I(y4l, y4l);
@@ -3031,7 +3030,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
__m512 acc[2*nrc_y] = {};
__m512i qx[8];
float d8[8*nrc_y];
auto m127 = _mm512_set1_epi8(127);
for (int ix = 0; ix < nrc_x; ix += 16) {
const block_q8_0_r8 * q8l = (const block_q8_0_r8 *)((const char *)vx + (ix+0)*bx);
const block_q8_0_r8 * q8h = (const block_q8_0_r8 *)((const char *)vx + (ix+8)*bx);
@@ -3046,7 +3044,6 @@ static void mul_mat_q8_0_r4_q8_1(int n, const void * vx, size_t bx, const DataIn
for (int j = 0; j < 8; ++j) {
qx[j] = _mm512_inserti32x8(_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)q8l[4*ib4+k].qs+j)),
_mm256_loadu_si256((const __m256i *)q8h[4*ib4+k].qs+j), 1);
qx[j] = _mm512_add_epi8(qx[j], m127);
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto y4l = _mm_loadu_si128((const __m128i*)q8.y[iy][ib4].qs+2*k+0);
@@ -5070,12 +5067,7 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1);
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2);
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3);
#ifdef HAVE_FANCY_SIMD
qx[0] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+0), _mm256_set1_epi8(-128));
qx[1] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1), _mm256_set1_epi8(-128));
qx[2] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2), _mm256_set1_epi8(-128));
qx[3] = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3), _mm256_set1_epi8(-128));
#else
#ifndef HAVE_FANCY_SIMD
auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
@@ -13037,6 +13029,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
#ifdef HAVE_FANCY_SIMD
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 0, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 1, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 2, m2);
@@ -13053,6 +13051,12 @@ struct HelperQ80R4 : public BaseHelper<step> {
m1 = _mm256_unpackhi_epi64(t0, t1);
m2 = _mm256_unpacklo_epi64(t2, t3);
m3 = _mm256_unpackhi_epi64(t2, t3);
#ifdef HAVE_FANCY_SIMD
m0 = _mm256_xor_si256(m0, _mm256_set1_epi8(-128));
m1 = _mm256_xor_si256(m1, _mm256_set1_epi8(-128));
m2 = _mm256_xor_si256(m2, _mm256_set1_epi8(-128));
m3 = _mm256_xor_si256(m3, _mm256_set1_epi8(-128));
#endif
_mm256_storeu_si256((__m256i *)y[ib].qs + 4, m0);
_mm256_storeu_si256((__m256i *)y[ib].qs + 5, m1);
_mm256_storeu_si256((__m256i *)y[ib].qs + 6, m2);

View File

@@ -43,6 +43,15 @@ constexpr int popcount(uint32_t x) { return __builtin_popcount(x); }
constexpr int popcount(uint64_t x) { return __builtin_popcountll(x); }
#endif
#if defined __x86_64__
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
#endif
namespace {
inline int nearest_int(float fval) {
@@ -3746,12 +3755,33 @@ static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8
y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
}
}
#ifdef HAVE_FANCY_SIMD
if (online) {
for (int l = 0; l < 4; ++l) {
auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), _mm512_set1_epi8(127));
_mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
}
}
#endif
}
x += 8*nblock;
y += nblock;
}
}
#ifdef HAVE_FANCY_SIMD
// Post-repack fixup applied when q8_0_r4 data will be consumed by the
// AVX512 (HAVE_FANCY_SIMD) matmul kernels: adds 127 to every quant so the
// signed int8 values become unsigned, letting the kernels skip that step.
// k:  total number of weights covered by cy; each repacked block holds
//     8 interleaved rows of 32 quants, hence k/(32*8) blocks.
// cy: the repacked tensor data, modified in place.
static void modify_q8_0_r4(int64_t k, char * cy) {
    // NOTE(review): the cast uses block_iq4_nl_r8 although this buffer holds
    // q8_0_r4 data (the matmul path casts the same data to block_q8_0_r8);
    // presumably the two structs have identical .qs layout — confirm.
    auto y = (block_iq4_nl_r8 *)cy;
    int nb = k/(32*8);
    for (int ib = 0; ib < nb; ++ib) {
        // 4 x 64-byte vectors = the 256 quants of one repacked block
        for (int l = 0; l < 4; ++l) {
            auto v = _mm512_add_epi8(_mm512_loadu_si512((const __m512i *)y[ib].qs + l), _mm512_set1_epi8(127));
            _mm512_storeu_si512((__m512i *)y[ib].qs + l, v);
        }
    }
}
#endif
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
GGML_ASSERT(nrows%8 == 0);
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
@@ -5195,11 +5225,31 @@ static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8
for (int i = 0; i < 4; ++i) y[ibl].qs[32*ib + 4*k + i] = x8[k][ibl].qs[4*ib+i];
}
}
#ifdef HAVE_FANCY_SIMD
if (online) {
for (int l = 0; l < 32; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ibl].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[ibl].qs + l, v);
}
}
#endif
}
x += 8*nblock;
y += nblock;
}
}
#ifdef HAVE_FANCY_SIMD
// Post-repack fixup applied when q8_k_r8 data will be consumed by the
// AVX512 (HAVE_FANCY_SIMD) matmul kernels: flips the sign bit (XOR 0x80)
// of every quant so the signed int8 values become unsigned, letting the
// kernels skip that conversion.
// k:  total number of weights covered by cy; each repacked block holds
//     8 interleaved rows of 256 quants, hence k/(256*8) blocks.
// cy: the repacked tensor data, modified in place.
static void modify_q8_k_r8(int64_t k, char * cy) {
    auto blocks = (block_q8_k_r8 *)cy;
    const auto sign_bit = _mm512_set1_epi8(-128);
    int nblock = k/(256*8);
    for (int ibl = 0; ibl < nblock; ++ibl) {
        // 32 x 64-byte vectors = the 2048 quants of one repacked block
        auto q = (__m512i *)blocks[ibl].qs;
        for (int j = 0; j < 32; ++j) {
            _mm512_storeu_si512(q + j, _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)q + j), sign_bit));
        }
    }
}
#endif
size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
GGML_ASSERT(nrows%8 == 0);
@@ -6057,6 +6107,10 @@ bool iqk_modify_tensor(struct ggml_tensor * tensor) {
static const std::unordered_map<ggml_type, Modify> k_mod_map = {
#ifdef __ARM_NEON
{ GGML_TYPE_Q4_0_R4, {modify_q4_0_r4, 8} },
#endif
#ifdef HAVE_FANCY_SIMD
{ GGML_TYPE_Q8_0_R4, {modify_q8_0_r4, 8} },
{ GGML_TYPE_Q8_K_R8, {modify_q8_k_r8, 8} },
#endif
};
auto it = k_mod_map.find(tensor->type);