mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 15:14:10 +00:00
Refactor iqk: Factor out GEMM for repacked i-quants
This commit is contained in:
@@ -741,6 +741,823 @@ static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataIn
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_xxs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xxs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto qs = iq2[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(iq2xxs_grid[qs[ 3]], iq2xxs_grid[qs[ 2]], iq2xxs_grid[qs[ 1]], iq2xxs_grid[qs[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xxs_grid[qs[ 7]], iq2xxs_grid[qs[ 6]], iq2xxs_grid[qs[ 5]], iq2xxs_grid[qs[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xxs_grid[qs[11]], iq2xxs_grid[qs[10]], iq2xxs_grid[qs[ 9]], iq2xxs_grid[qs[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xxs_grid[qs[15]], iq2xxs_grid[qs[14]], iq2xxs_grid[qs[13]], iq2xxs_grid[qs[12]]);
|
||||
qs += 16;
|
||||
auto sas = _mm_loadu_si128((const __m128i *)iq2[ibl].sas + ib);
|
||||
auto scales = _mm_and_si128(sas, _mm_set1_epi8(1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
scales = _mm_dpbusd_epi32(_mm_set1_epi32(1), scales, _mm_set1_epi32(0x10080402));
|
||||
#else
|
||||
scales = _mm_maddubs_epi16(scales, _mm_set1_epi32(0x10080402));
|
||||
scales = _mm_add_epi32(_mm_madd_epi16(_mm_set1_epi16(1), scales), _mm_set1_epi32(1));
|
||||
#endif
|
||||
auto scales32 = MM256_SET_M128I(scales, scales);
|
||||
auto signs128 = _mm_and_si128(sas, _mm_set1_epi8(-2)); // 0xfe = -2 as signed. Needed to shutup compiler warning.
|
||||
signs128 = _mm_xor_si128(signs128, _mm_srli_epi16(signs128, 1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y));
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y));
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y));
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1)));
|
||||
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2)));
|
||||
auto sumi3 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3)));
|
||||
auto sumi4 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4)));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_xs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
auto s_shuffle = _mm_set_epi64x(0x0f0d0b0907050301, 0x0e0c0a0806040200);
|
||||
__m256i qx[4];
|
||||
union { __m256i vec; uint16_t val[16]; } helper;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto val = _mm256_loadu_si256((const __m256i *)iq2[ibl].qs + ib);
|
||||
helper.vec = _mm256_and_si256(val, _mm256_set1_epi16(511));
|
||||
qx[0] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 3]], iq2xs_grid[helper.val[ 2]], iq2xs_grid[helper.val[ 1]], iq2xs_grid[helper.val[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 7]], iq2xs_grid[helper.val[ 6]], iq2xs_grid[helper.val[ 5]], iq2xs_grid[helper.val[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xs_grid[helper.val[11]], iq2xs_grid[helper.val[10]], iq2xs_grid[helper.val[ 9]], iq2xs_grid[helper.val[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xs_grid[helper.val[15]], iq2xs_grid[helper.val[14]], iq2xs_grid[helper.val[13]], iq2xs_grid[helper.val[12]]);
|
||||
auto signs16 = _mm256_srli_epi16(val, 9);
|
||||
signs16 = _mm256_xor_si256(signs16, _mm256_slli_epi16(signs16, 1));
|
||||
auto signs128 = _mm_or_si128(_mm256_castsi256_si128(signs16), _mm_slli_epi16(_mm256_extracti128_si256(signs16, 1), 8));
|
||||
signs128 = _mm_shuffle_epi8(signs128, s_shuffle);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y)); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y)); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y)); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y)); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
if constexpr (nrc_y == 1) {
|
||||
isum[0] = _mm256_add_epi32(isum[0], _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))));
|
||||
isum[1] = _mm256_add_epi32(isum[1], _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))));
|
||||
isum[2] = _mm256_add_epi32(isum[2], _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))));
|
||||
isum[3] = _mm256_add_epi32(isum[3], _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))));
|
||||
} else {
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_iq2_xs_r4_q8_k_16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
constexpr int nrc_y = 16;
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
auto s_shuffle = _mm_set_epi64x(0x0f0d0b0907050301, 0x0e0c0a0806040200);
|
||||
__m256i qx[4];
|
||||
union { __m256i vec; uint16_t val[16]; } helper;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
{
|
||||
auto scale_bits = _mm256_loadu_si256((const __m256i *)iq2[ibl].scales);
|
||||
auto scales1 = _mm256_and_si256(scale_bits, _mm256_set1_epi8(0xf));
|
||||
auto scales2 = _mm256_and_si256(_mm256_srli_epi16(scale_bits, 4), _mm256_set1_epi8(0xf));
|
||||
scales1 = _mm256_or_si256(_mm256_slli_epi16(scales1, 1), _mm256_set1_epi8(1));
|
||||
scales2 = _mm256_or_si256(_mm256_slli_epi16(scales2, 1), _mm256_set1_epi8(1));
|
||||
auto s1_8 = _mm256_unpacklo_epi8(scales1, scales2); // blocks 0...15, 32...47 (0...3, 8...11 from each row)
|
||||
auto s2_8 = _mm256_unpackhi_epi8(scales1, scales2); // blocks 16..31, 48...63 (4...7, 12..15 from each row)
|
||||
auto s1_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s1_8)); // 0...15 (0...3 from each row)
|
||||
auto s2_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s1_8, 1)); // 32...47 (8..11 from each row)
|
||||
auto s3_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s2_8)); // 16...31 (4...7 from each row)
|
||||
auto s4_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s2_8, 1)); // 48...63 (12.15 from each row)
|
||||
auto t1 = MM256_SET_M128I(_mm256_castsi256_si128(s2_16), _mm256_castsi256_si128(s1_16)); // 0,1 and 8,9 from each row
|
||||
auto t2 = MM256_SET_M128I(_mm256_extracti128_si256(s2_16, 1), _mm256_extracti128_si256(s1_16, 1)); // 2,3 and 10,11 from each row
|
||||
auto t3 = MM256_SET_M128I(_mm256_castsi256_si128(s4_16), _mm256_castsi256_si128(s3_16)); // 4,5 and 12,13 from each row
|
||||
auto t4 = MM256_SET_M128I(_mm256_extracti128_si256(s4_16, 1), _mm256_extracti128_si256(s3_16, 1)); // 6,7 and 14,15 from each row
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto bsums = q8.load_bsums(iy, ibl);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t1, _mm256_shuffle_epi32(bsums, 0x00));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t2, _mm256_shuffle_epi32(bsums, 0x55));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t3, _mm256_shuffle_epi32(bsums, 0xaa));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t4, _mm256_shuffle_epi32(bsums, 0xff));
|
||||
#else
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t1, _mm256_shuffle_epi32(bsums, 0x00)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t2, _mm256_shuffle_epi32(bsums, 0x55)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t3, _mm256_shuffle_epi32(bsums, 0xaa)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t4, _mm256_shuffle_epi32(bsums, 0xff)));
|
||||
#endif
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(-64.f*q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
}
|
||||
}
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto val = _mm256_loadu_si256((const __m256i *)iq2[ibl].qs + ib);
|
||||
helper.vec = _mm256_and_si256(val, _mm256_set1_epi16(511));
|
||||
qx[0] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 3]], iq2xs_grid[helper.val[ 2]], iq2xs_grid[helper.val[ 1]], iq2xs_grid[helper.val[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 7]], iq2xs_grid[helper.val[ 6]], iq2xs_grid[helper.val[ 5]], iq2xs_grid[helper.val[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xs_grid[helper.val[11]], iq2xs_grid[helper.val[10]], iq2xs_grid[helper.val[ 9]], iq2xs_grid[helper.val[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xs_grid[helper.val[15]], iq2xs_grid[helper.val[14]], iq2xs_grid[helper.val[13]], iq2xs_grid[helper.val[12]]);
|
||||
auto signs16 = _mm256_srli_epi16(val, 9);
|
||||
signs16 = _mm256_xor_si256(signs16, _mm256_slli_epi16(signs16, 1));
|
||||
auto signs128 = _mm_or_si128(_mm256_castsi256_si128(signs16), _mm_slli_epi16(_mm256_extracti128_si256(signs16, 1), 8));
|
||||
signs128 = _mm_shuffle_epi8(signs128, s_shuffle);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[0], mask[0], _mm256_setzero_si256(), qx[0]));
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[1], mask[1], _mm256_setzero_si256(), qx[1]));
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[2], mask[2], _mm256_setzero_si256(), qx[2]));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[3], mask[3], _mm256_setzero_si256(), qx[3]));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], y); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], y); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], y); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], y); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[0], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[1], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[2], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[3], s));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], y)); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], y)); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], y)); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], y)); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
__m256i qx[4];
|
||||
auto grid = iq2s_grid;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
auto ql = iq2[ibl].qs;
|
||||
auto qh = iq2[ibl].qh;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(grid[ql[ 3] | ((qh[0] << 2) & 0x300)], grid[ql[ 2] | ((qh[0] << 4) & 0x300)], grid[ql[ 1] | ((qh[0] << 6) & 0x300)], grid[ql[ 0] | ((qh[0] << 8) & 0x300)]);
|
||||
qx[1] = _mm256_set_epi64x(grid[ql[ 7] | ((qh[1] << 2) & 0x300)], grid[ql[ 6] | ((qh[1] << 4) & 0x300)], grid[ql[ 5] | ((qh[1] << 6) & 0x300)], grid[ql[ 4] | ((qh[1] << 8) & 0x300)]);
|
||||
qx[2] = _mm256_set_epi64x(grid[ql[11] | ((qh[2] << 2) & 0x300)], grid[ql[10] | ((qh[2] << 4) & 0x300)], grid[ql[ 9] | ((qh[2] << 6) & 0x300)], grid[ql[ 8] | ((qh[2] << 8) & 0x300)]);
|
||||
qx[3] = _mm256_set_epi64x(grid[ql[15] | ((qh[3] << 2) & 0x300)], grid[ql[14] | ((qh[3] << 4) & 0x300)], grid[ql[13] | ((qh[3] << 6) & 0x300)], grid[ql[12] | ((qh[3] << 8) & 0x300)]);
|
||||
ql += 16; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq2[ibl].signs + ib);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y)); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y)); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y)); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y)); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
if constexpr (nrc_y == 1) {
|
||||
isum[0] = _mm256_add_epi32(isum[0], _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))));
|
||||
isum[1] = _mm256_add_epi32(isum[1], _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))));
|
||||
isum[2] = _mm256_add_epi32(isum[2], _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))));
|
||||
isum[3] = _mm256_add_epi32(isum[3], _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))));
|
||||
} else {
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_iq2_s_r4_q8_k_16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
constexpr int nrc_y = 16;
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
__m256i qx[4];
|
||||
auto grid = iq2s_grid;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
auto ql = iq2[ibl].qs;
|
||||
auto qh = iq2[ibl].qh;
|
||||
{
|
||||
auto scale_bits = _mm256_loadu_si256((const __m256i *)iq2[ibl].scales);
|
||||
auto scales1 = _mm256_and_si256(scale_bits, _mm256_set1_epi8(0xf));
|
||||
auto scales2 = _mm256_and_si256(_mm256_srli_epi16(scale_bits, 4), _mm256_set1_epi8(0xf));
|
||||
scales1 = _mm256_or_si256(_mm256_slli_epi16(scales1, 1), _mm256_set1_epi8(1));
|
||||
scales2 = _mm256_or_si256(_mm256_slli_epi16(scales2, 1), _mm256_set1_epi8(1));
|
||||
auto s1_8 = _mm256_unpacklo_epi8(scales1, scales2); // blocks 0...15, 32...47 (0...3, 8...11 from each row)
|
||||
auto s2_8 = _mm256_unpackhi_epi8(scales1, scales2); // blocks 16..31, 48...63 (4...7, 12..15 from each row)
|
||||
auto s1_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s1_8)); // 0...15 (0...3 from each row)
|
||||
auto s2_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s1_8, 1)); // 32...47 (8..11 from each row)
|
||||
auto s3_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s2_8)); // 16...31 (4...7 from each row)
|
||||
auto s4_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s2_8, 1)); // 48...63 (12.15 from each row)
|
||||
auto t1 = MM256_SET_M128I(_mm256_castsi256_si128(s2_16), _mm256_castsi256_si128(s1_16)); // 0,1 and 8,9 from each row
|
||||
auto t2 = MM256_SET_M128I(_mm256_extracti128_si256(s2_16, 1), _mm256_extracti128_si256(s1_16, 1)); // 2,3 and 10,11 from each row
|
||||
auto t3 = MM256_SET_M128I(_mm256_castsi256_si128(s4_16), _mm256_castsi256_si128(s3_16)); // 4,5 and 12,13 from each row
|
||||
auto t4 = MM256_SET_M128I(_mm256_extracti128_si256(s4_16, 1), _mm256_extracti128_si256(s3_16, 1)); // 6,7 and 14,15 from each row
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto bsums = q8.load_bsums(iy, ibl);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t1, _mm256_shuffle_epi32(bsums, 0x00));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t2, _mm256_shuffle_epi32(bsums, 0x55));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t3, _mm256_shuffle_epi32(bsums, 0xaa));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t4, _mm256_shuffle_epi32(bsums, 0xff));
|
||||
#else
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t1, _mm256_shuffle_epi32(bsums, 0x00)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t2, _mm256_shuffle_epi32(bsums, 0x55)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t3, _mm256_shuffle_epi32(bsums, 0xaa)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t4, _mm256_shuffle_epi32(bsums, 0xff)));
|
||||
#endif
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(-64.f*q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
}
|
||||
}
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(grid[ql[ 3] | ((qh[0] << 2) & 0x300)], grid[ql[ 2] | ((qh[0] << 4) & 0x300)], grid[ql[ 1] | ((qh[0] << 6) & 0x300)], grid[ql[ 0] | ((qh[0] << 8) & 0x300)]);
|
||||
qx[1] = _mm256_set_epi64x(grid[ql[ 7] | ((qh[1] << 2) & 0x300)], grid[ql[ 6] | ((qh[1] << 4) & 0x300)], grid[ql[ 5] | ((qh[1] << 6) & 0x300)], grid[ql[ 4] | ((qh[1] << 8) & 0x300)]);
|
||||
qx[2] = _mm256_set_epi64x(grid[ql[11] | ((qh[2] << 2) & 0x300)], grid[ql[10] | ((qh[2] << 4) & 0x300)], grid[ql[ 9] | ((qh[2] << 6) & 0x300)], grid[ql[ 8] | ((qh[2] << 8) & 0x300)]);
|
||||
qx[3] = _mm256_set_epi64x(grid[ql[15] | ((qh[3] << 2) & 0x300)], grid[ql[14] | ((qh[3] << 4) & 0x300)], grid[ql[13] | ((qh[3] << 6) & 0x300)], grid[ql[12] | ((qh[3] << 8) & 0x300)]);
|
||||
ql += 16; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq2[ibl].signs + ib);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[0], mask[0], _mm256_setzero_si256(), qx[0]));
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[1], mask[1], _mm256_setzero_si256(), qx[1]));
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[2], mask[2], _mm256_setzero_si256(), qx[2]));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[3], mask[3], _mm256_setzero_si256(), qx[3]));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], y); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], y); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], y); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], y); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[0], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[1], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[2], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[3], s));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], y)); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], y)); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], y)); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], y)); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq3_xxs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq3 = (const block_iq3_xxs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_mul_ps(_mm_set1_ps(0.25f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq3[ibl].d))); // TODO: absorb the 0.25 factor into d when quantizing/repacking
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+ 7]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 6]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 5]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 4]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+ 3]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 2]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 1]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 0]]);
|
||||
qx[1] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+15]], iq3xxs_grid[iq3[ibl].qs[32*ib+14]], iq3xxs_grid[iq3[ibl].qs[32*ib+13]], iq3xxs_grid[iq3[ibl].qs[32*ib+12]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+11]], iq3xxs_grid[iq3[ibl].qs[32*ib+10]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 9]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 8]]);
|
||||
qx[2] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+23]], iq3xxs_grid[iq3[ibl].qs[32*ib+22]], iq3xxs_grid[iq3[ibl].qs[32*ib+21]], iq3xxs_grid[iq3[ibl].qs[32*ib+20]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+19]], iq3xxs_grid[iq3[ibl].qs[32*ib+18]], iq3xxs_grid[iq3[ibl].qs[32*ib+17]], iq3xxs_grid[iq3[ibl].qs[32*ib+16]]);
|
||||
qx[3] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+31]], iq3xxs_grid[iq3[ibl].qs[32*ib+30]], iq3xxs_grid[iq3[ibl].qs[32*ib+29]], iq3xxs_grid[iq3[ibl].qs[32*ib+28]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+27]], iq3xxs_grid[iq3[ibl].qs[32*ib+26]], iq3xxs_grid[iq3[ibl].qs[32*ib+25]], iq3xxs_grid[iq3[ibl].qs[32*ib+24]]);
|
||||
auto sas = _mm_loadu_si128((const __m128i *)iq3[ibl].sas + ib);
|
||||
auto scales = _mm_and_si128(sas, _mm_set1_epi8(1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
scales = _mm_dpbusd_epi32(_mm_set1_epi32(1), scales, _mm_set1_epi32(0x10080402));
|
||||
#else
|
||||
scales = _mm_maddubs_epi16(scales, _mm_set1_epi32(0x10080402));
|
||||
scales = _mm_add_epi32(_mm_madd_epi16(_mm_set1_epi16(1), scales), _mm_set1_epi32(1));
|
||||
//auto t1 = _mm_or_si128(_mm_and_si128(scales, _mm_set1_epi32(0x00000001)), _mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x00000100)), 7));
|
||||
//auto t2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x00010000)), 14), _mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x01000000)), 21));
|
||||
//scales = _mm_or_si128(_mm_slli_epi32(_mm_or_si128(t1, t2), 1), _mm_set1_epi32(1));
|
||||
#endif
|
||||
auto scales32 = MM256_SET_M128I(scales, scales);
|
||||
auto signs128 = _mm_and_si128(sas, _mm_set1_epi8(-2)); // 0xfe = -2 as signed. Needed to shutup compiler warning.
|
||||
signs128 = _mm_xor_si128(signs128, _mm_srli_epi16(signs128, 1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y));
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y));
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y));
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1)));
|
||||
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2)));
|
||||
auto sumi3 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3)));
|
||||
auto sumi4 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4)));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, sum);
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq3_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
auto smask = _mm256_set1_epi8(1);
|
||||
union { __m256i vec; uint32_t val[8]; } helper;
|
||||
union { __m128i vec; uint16_t val[8]; } hidx;
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__mmask32 mask[4];
|
||||
#endif
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq3 = (const block_iq3_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq3[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto qs = iq3[ibl].qs;
|
||||
auto qh = iq3[ibl].qh;
|
||||
auto scale_bits = _mm_loadu_si128((const __m128i *)iq3[ibl].scales);
|
||||
auto scales8 = MM256_SET_M128I(_mm_srli_epi16(scale_bits, 4), scale_bits);
|
||||
helper.vec = _mm256_or_si256(_mm256_slli_epi16(_mm256_and_si256(scales8, _mm256_set1_epi8(0xf)), 1), _mm256_set1_epi8(1));
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto qh32 = (const uint32_t *)qh;
|
||||
auto idx_h = _mm_sllv_epi64(_mm_cvtepu8_epi16(_mm_set1_epi32(qh32[0])), _mm_set_epi64x(4, 8));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
auto idx_l = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(qs + 8*i)));
|
||||
hidx.vec = _mm_or_si128(idx_l, _mm_and_si128(idx_h, _mm_set1_epi16(0x100))); idx_h = _mm_srli_epi16(idx_h, 1);
|
||||
qx[i] = _mm256_set_epi32(iq3s_grid[hidx.val[7]], iq3s_grid[hidx.val[6]], iq3s_grid[hidx.val[5]], iq3s_grid[hidx.val[4]],
|
||||
iq3s_grid[hidx.val[3]], iq3s_grid[hidx.val[2]], iq3s_grid[hidx.val[1]], iq3s_grid[hidx.val[0]]);
|
||||
}
|
||||
qs += 32; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq3[ibl].signs + ib);
|
||||
auto signs = MM256_SET_M128I(_mm_srli_epi16(signs128, 4), signs128);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto scales = _mm256_cvtepi8_epi32(_mm_set1_epi32(helper.val[ib]));
|
||||
mask[0] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[1] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[2] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[3] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
auto ys = _mm256_shuffle_epi32(y, 0x00);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[0], _mm256_mask_sub_epi8(ys, mask[0], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0x55);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[1], _mm256_mask_sub_epi8(ys, mask[1], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0xaa);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[2], _mm256_mask_sub_epi8(ys, mask[2], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0xff);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[3], _mm256_mask_sub_epi8(ys, mask[3], _mm256_setzero_si256(), ys));
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(sumi, scales));
|
||||
}
|
||||
#else
|
||||
auto scales16 = _mm256_cvtepi8_epi16(_mm_set1_epi32(helper.val[ib]));
|
||||
auto scales = _mm256_unpacklo_epi16(scales16, scales16);
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), s1)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), s2)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), s3)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), s4)));
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_madd_epi16(scales, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, sum);
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX_NY>& funcs) {
|
||||
funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
|
||||
funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
|
||||
@@ -754,27 +1571,53 @@ template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX
|
||||
|
||||
} // namespace
|
||||
|
||||
bool iqk_set_kernels_iquants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels) {
|
||||
bool iqk_set_kernels_iquants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels, mul_mat_t& func16) {
|
||||
|
||||
if (ne00%QK_K != 0 || ggml_type(typeB) != GGML_TYPE_Q8_K) {
|
||||
return false;
|
||||
}
|
||||
|
||||
func16 = nullptr;
|
||||
|
||||
switch (typeA) {
|
||||
case GGML_TYPE_IQ3_S:
|
||||
set_functions<DequantizerIQ3S>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
set_functions<DequantizerIQ3XXS>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
set_functions<DequantizerIQ2S>(kernels);
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
set_functions<DequantizerIQ2XXS>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
set_functions<DequantizerIQ2XS>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
set_functions<DequantizerIQ2XXS>(kernels);
|
||||
case GGML_TYPE_IQ2_S:
|
||||
set_functions<DequantizerIQ2S>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
set_functions<DequantizerIQ3XXS>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
set_functions<DequantizerIQ3S>(kernels);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq2_xxs_r4_q8_k, kernels);
|
||||
func16 = mul_mat_iq2_xxs_r4_q8_k<16>;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq2_xs_r4_q8_k, kernels);
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
// For some reason Zen4 does not like this particular function
|
||||
func16 = mul_mat_iq2_xs_r4_q8_k_16;
|
||||
#endif
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S_R4:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq2_s_r4_q8_k, kernels);
|
||||
func16 = mul_mat_iq2_s_r4_q8_k_16;
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS_R4:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq3_xxs_r4_q8_k, kernels);
|
||||
func16 = mul_mat_iq3_xxs_r4_q8_k<16>;
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S_R4:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq3_s_r4_q8_k, kernels);
|
||||
func16 = mul_mat_iq3_s_r4_q8_k<16>;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -6,6 +6,6 @@
|
||||
|
||||
#include <array>
|
||||
|
||||
bool iqk_set_kernels_iquants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels);
|
||||
bool iqk_set_kernels_iquants(int ne00, int typeA, int typeB, std::array<mul_mat_t, IQK_MAX_NY>& kernels, mul_mat_t& func16);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1289,867 +1289,6 @@ static void mul_mat_iq4_ks_r4_q8_k(int n, const void * vx, size_t bx, const Data
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_xxs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xxs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto qs = iq2[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(iq2xxs_grid[qs[ 3]], iq2xxs_grid[qs[ 2]], iq2xxs_grid[qs[ 1]], iq2xxs_grid[qs[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xxs_grid[qs[ 7]], iq2xxs_grid[qs[ 6]], iq2xxs_grid[qs[ 5]], iq2xxs_grid[qs[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xxs_grid[qs[11]], iq2xxs_grid[qs[10]], iq2xxs_grid[qs[ 9]], iq2xxs_grid[qs[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xxs_grid[qs[15]], iq2xxs_grid[qs[14]], iq2xxs_grid[qs[13]], iq2xxs_grid[qs[12]]);
|
||||
qs += 16;
|
||||
auto sas = _mm_loadu_si128((const __m128i *)iq2[ibl].sas + ib);
|
||||
auto scales = _mm_and_si128(sas, _mm_set1_epi8(1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
scales = _mm_dpbusd_epi32(_mm_set1_epi32(1), scales, _mm_set1_epi32(0x10080402));
|
||||
#else
|
||||
scales = _mm_maddubs_epi16(scales, _mm_set1_epi32(0x10080402));
|
||||
scales = _mm_add_epi32(_mm_madd_epi16(_mm_set1_epi16(1), scales), _mm_set1_epi32(1));
|
||||
#endif
|
||||
auto scales32 = MM256_SET_M128I(scales, scales);
|
||||
auto signs128 = _mm_and_si128(sas, _mm_set1_epi8(-2)); // 0xfe = -2 as signed. Needed to shutup compiler warning.
|
||||
signs128 = _mm_xor_si128(signs128, _mm_srli_epi16(signs128, 1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y));
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y));
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y));
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1)));
|
||||
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2)));
|
||||
auto sumi3 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3)));
|
||||
auto sumi4 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4)));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_xs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
auto s_shuffle = _mm_set_epi64x(0x0f0d0b0907050301, 0x0e0c0a0806040200);
|
||||
__m256i qx[4];
|
||||
union { __m256i vec; uint16_t val[16]; } helper;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto val = _mm256_loadu_si256((const __m256i *)iq2[ibl].qs + ib);
|
||||
helper.vec = _mm256_and_si256(val, _mm256_set1_epi16(511));
|
||||
qx[0] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 3]], iq2xs_grid[helper.val[ 2]], iq2xs_grid[helper.val[ 1]], iq2xs_grid[helper.val[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 7]], iq2xs_grid[helper.val[ 6]], iq2xs_grid[helper.val[ 5]], iq2xs_grid[helper.val[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xs_grid[helper.val[11]], iq2xs_grid[helper.val[10]], iq2xs_grid[helper.val[ 9]], iq2xs_grid[helper.val[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xs_grid[helper.val[15]], iq2xs_grid[helper.val[14]], iq2xs_grid[helper.val[13]], iq2xs_grid[helper.val[12]]);
|
||||
auto signs16 = _mm256_srli_epi16(val, 9);
|
||||
signs16 = _mm256_xor_si256(signs16, _mm256_slli_epi16(signs16, 1));
|
||||
auto signs128 = _mm_or_si128(_mm256_castsi256_si128(signs16), _mm_slli_epi16(_mm256_extracti128_si256(signs16, 1), 8));
|
||||
signs128 = _mm_shuffle_epi8(signs128, s_shuffle);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y)); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y)); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y)); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y)); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
if constexpr (nrc_y == 1) {
|
||||
isum[0] = _mm256_add_epi32(isum[0], _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))));
|
||||
isum[1] = _mm256_add_epi32(isum[1], _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))));
|
||||
isum[2] = _mm256_add_epi32(isum[2], _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))));
|
||||
isum[3] = _mm256_add_epi32(isum[3], _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))));
|
||||
} else {
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_iq2_xs_r4_q8_k_16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
constexpr int nrc_y = 16;
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
auto s_shuffle = _mm_set_epi64x(0x0f0d0b0907050301, 0x0e0c0a0806040200);
|
||||
__m256i qx[4];
|
||||
union { __m256i vec; uint16_t val[16]; } helper;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_xs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
{
|
||||
auto scale_bits = _mm256_loadu_si256((const __m256i *)iq2[ibl].scales);
|
||||
auto scales1 = _mm256_and_si256(scale_bits, _mm256_set1_epi8(0xf));
|
||||
auto scales2 = _mm256_and_si256(_mm256_srli_epi16(scale_bits, 4), _mm256_set1_epi8(0xf));
|
||||
scales1 = _mm256_or_si256(_mm256_slli_epi16(scales1, 1), _mm256_set1_epi8(1));
|
||||
scales2 = _mm256_or_si256(_mm256_slli_epi16(scales2, 1), _mm256_set1_epi8(1));
|
||||
auto s1_8 = _mm256_unpacklo_epi8(scales1, scales2); // blocks 0...15, 32...47 (0...3, 8...11 from each row)
|
||||
auto s2_8 = _mm256_unpackhi_epi8(scales1, scales2); // blocks 16..31, 48...63 (4...7, 12..15 from each row)
|
||||
auto s1_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s1_8)); // 0...15 (0...3 from each row)
|
||||
auto s2_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s1_8, 1)); // 32...47 (8..11 from each row)
|
||||
auto s3_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s2_8)); // 16...31 (4...7 from each row)
|
||||
auto s4_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s2_8, 1)); // 48...63 (12.15 from each row)
|
||||
auto t1 = MM256_SET_M128I(_mm256_castsi256_si128(s2_16), _mm256_castsi256_si128(s1_16)); // 0,1 and 8,9 from each row
|
||||
auto t2 = MM256_SET_M128I(_mm256_extracti128_si256(s2_16, 1), _mm256_extracti128_si256(s1_16, 1)); // 2,3 and 10,11 from each row
|
||||
auto t3 = MM256_SET_M128I(_mm256_castsi256_si128(s4_16), _mm256_castsi256_si128(s3_16)); // 4,5 and 12,13 from each row
|
||||
auto t4 = MM256_SET_M128I(_mm256_extracti128_si256(s4_16, 1), _mm256_extracti128_si256(s3_16, 1)); // 6,7 and 14,15 from each row
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto bsums = q8.load_bsums(iy, ibl);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t1, _mm256_shuffle_epi32(bsums, 0x00));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t2, _mm256_shuffle_epi32(bsums, 0x55));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t3, _mm256_shuffle_epi32(bsums, 0xaa));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t4, _mm256_shuffle_epi32(bsums, 0xff));
|
||||
#else
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t1, _mm256_shuffle_epi32(bsums, 0x00)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t2, _mm256_shuffle_epi32(bsums, 0x55)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t3, _mm256_shuffle_epi32(bsums, 0xaa)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t4, _mm256_shuffle_epi32(bsums, 0xff)));
|
||||
#endif
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(-64.f*q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
}
|
||||
}
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto val = _mm256_loadu_si256((const __m256i *)iq2[ibl].qs + ib);
|
||||
helper.vec = _mm256_and_si256(val, _mm256_set1_epi16(511));
|
||||
qx[0] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 3]], iq2xs_grid[helper.val[ 2]], iq2xs_grid[helper.val[ 1]], iq2xs_grid[helper.val[ 0]]);
|
||||
qx[1] = _mm256_set_epi64x(iq2xs_grid[helper.val[ 7]], iq2xs_grid[helper.val[ 6]], iq2xs_grid[helper.val[ 5]], iq2xs_grid[helper.val[ 4]]);
|
||||
qx[2] = _mm256_set_epi64x(iq2xs_grid[helper.val[11]], iq2xs_grid[helper.val[10]], iq2xs_grid[helper.val[ 9]], iq2xs_grid[helper.val[ 8]]);
|
||||
qx[3] = _mm256_set_epi64x(iq2xs_grid[helper.val[15]], iq2xs_grid[helper.val[14]], iq2xs_grid[helper.val[13]], iq2xs_grid[helper.val[12]]);
|
||||
auto signs16 = _mm256_srli_epi16(val, 9);
|
||||
signs16 = _mm256_xor_si256(signs16, _mm256_slli_epi16(signs16, 1));
|
||||
auto signs128 = _mm_or_si128(_mm256_castsi256_si128(signs16), _mm_slli_epi16(_mm256_extracti128_si256(signs16, 1), 8));
|
||||
signs128 = _mm_shuffle_epi8(signs128, s_shuffle);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[0], mask[0], _mm256_setzero_si256(), qx[0]));
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[1], mask[1], _mm256_setzero_si256(), qx[1]));
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[2], mask[2], _mm256_setzero_si256(), qx[2]));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[3], mask[3], _mm256_setzero_si256(), qx[3]));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], y); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], y); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], y); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], y); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[0], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[1], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[2], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[3], s));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], y)); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], y)); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], y)); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], y)); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq2_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
__m256i qx[4];
|
||||
auto grid = iq2s_grid;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
auto ql = iq2[ibl].qs;
|
||||
auto qh = iq2[ibl].qh;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(grid[ql[ 3] | ((qh[0] << 2) & 0x300)], grid[ql[ 2] | ((qh[0] << 4) & 0x300)], grid[ql[ 1] | ((qh[0] << 6) & 0x300)], grid[ql[ 0] | ((qh[0] << 8) & 0x300)]);
|
||||
qx[1] = _mm256_set_epi64x(grid[ql[ 7] | ((qh[1] << 2) & 0x300)], grid[ql[ 6] | ((qh[1] << 4) & 0x300)], grid[ql[ 5] | ((qh[1] << 6) & 0x300)], grid[ql[ 4] | ((qh[1] << 8) & 0x300)]);
|
||||
qx[2] = _mm256_set_epi64x(grid[ql[11] | ((qh[2] << 2) & 0x300)], grid[ql[10] | ((qh[2] << 4) & 0x300)], grid[ql[ 9] | ((qh[2] << 6) & 0x300)], grid[ql[ 8] | ((qh[2] << 8) & 0x300)]);
|
||||
qx[3] = _mm256_set_epi64x(grid[ql[15] | ((qh[3] << 2) & 0x300)], grid[ql[14] | ((qh[3] << 4) & 0x300)], grid[ql[13] | ((qh[3] << 6) & 0x300)], grid[ql[12] | ((qh[3] << 8) & 0x300)]);
|
||||
ql += 16; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq2[ibl].signs + ib);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y)); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y)); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y)); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y)); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
if constexpr (nrc_y == 1) {
|
||||
isum[0] = _mm256_add_epi32(isum[0], _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))));
|
||||
isum[1] = _mm256_add_epi32(isum[1], _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))));
|
||||
isum[2] = _mm256_add_epi32(isum[2], _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))));
|
||||
isum[3] = _mm256_add_epi32(isum[3], _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))));
|
||||
} else {
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1))); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2))); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3))); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4))); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
if constexpr (nrc_y == 1) {
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[0], isum[1]), _mm256_unpackhi_epi32(isum[0], isum[1]));
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(isum[2], isum[3]), _mm256_unpackhi_epi32(isum[2], isum[3]));
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34));
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[0] = isum[1] = isum[2] = isum[3] = _mm256_setzero_si256();
|
||||
} else {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_iq2_s_r4_q8_k_16(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
constexpr int nrc_y = 16;
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i shuffles[2] = {
|
||||
_mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100),
|
||||
_mm256_set_epi64x(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
|
||||
};
|
||||
__m256i isum[2*nrc_y] = {};
|
||||
#else
|
||||
__m256i shuffles[4] = {
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0302), _mm_set1_epi16(0x0100)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0706), _mm_set1_epi16(0x0504)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0b0a), _mm_set1_epi16(0x0908)),
|
||||
MM256_SET_M128I(_mm_set1_epi16(0x0f0e), _mm_set1_epi16(0x0d0c)),
|
||||
};
|
||||
__m256i isum[nrc_y == 1 ? 4 : nrc_y] = {};
|
||||
#endif
|
||||
__m256i qx[4];
|
||||
auto grid = iq2s_grid;
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq2 = (const block_iq2_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq2[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto s32 = (const uint32_t *)iq2[ibl].scales;
|
||||
auto ql = iq2[ibl].qs;
|
||||
auto qh = iq2[ibl].qh;
|
||||
{
|
||||
auto scale_bits = _mm256_loadu_si256((const __m256i *)iq2[ibl].scales);
|
||||
auto scales1 = _mm256_and_si256(scale_bits, _mm256_set1_epi8(0xf));
|
||||
auto scales2 = _mm256_and_si256(_mm256_srli_epi16(scale_bits, 4), _mm256_set1_epi8(0xf));
|
||||
scales1 = _mm256_or_si256(_mm256_slli_epi16(scales1, 1), _mm256_set1_epi8(1));
|
||||
scales2 = _mm256_or_si256(_mm256_slli_epi16(scales2, 1), _mm256_set1_epi8(1));
|
||||
auto s1_8 = _mm256_unpacklo_epi8(scales1, scales2); // blocks 0...15, 32...47 (0...3, 8...11 from each row)
|
||||
auto s2_8 = _mm256_unpackhi_epi8(scales1, scales2); // blocks 16..31, 48...63 (4...7, 12..15 from each row)
|
||||
auto s1_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s1_8)); // 0...15 (0...3 from each row)
|
||||
auto s2_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s1_8, 1)); // 32...47 (8..11 from each row)
|
||||
auto s3_16 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(s2_8)); // 16...31 (4...7 from each row)
|
||||
auto s4_16 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(s2_8, 1)); // 48...63 (12.15 from each row)
|
||||
auto t1 = MM256_SET_M128I(_mm256_castsi256_si128(s2_16), _mm256_castsi256_si128(s1_16)); // 0,1 and 8,9 from each row
|
||||
auto t2 = MM256_SET_M128I(_mm256_extracti128_si256(s2_16, 1), _mm256_extracti128_si256(s1_16, 1)); // 2,3 and 10,11 from each row
|
||||
auto t3 = MM256_SET_M128I(_mm256_castsi256_si128(s4_16), _mm256_castsi256_si128(s3_16)); // 4,5 and 12,13 from each row
|
||||
auto t4 = MM256_SET_M128I(_mm256_extracti128_si256(s4_16, 1), _mm256_extracti128_si256(s3_16, 1)); // 6,7 and 14,15 from each row
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto bsums = q8.load_bsums(iy, ibl);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t1, _mm256_shuffle_epi32(bsums, 0x00));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t2, _mm256_shuffle_epi32(bsums, 0x55));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t3, _mm256_shuffle_epi32(bsums, 0xaa));
|
||||
sumi = _mm256_dpwssd_epi32(sumi, t4, _mm256_shuffle_epi32(bsums, 0xff));
|
||||
#else
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t1, _mm256_shuffle_epi32(bsums, 0x00)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t2, _mm256_shuffle_epi32(bsums, 0x55)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t3, _mm256_shuffle_epi32(bsums, 0xaa)));
|
||||
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(t4, _mm256_shuffle_epi32(bsums, 0xff)));
|
||||
#endif
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(-64.f*q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
}
|
||||
}
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi64x(grid[ql[ 3] | ((qh[0] << 2) & 0x300)], grid[ql[ 2] | ((qh[0] << 4) & 0x300)], grid[ql[ 1] | ((qh[0] << 6) & 0x300)], grid[ql[ 0] | ((qh[0] << 8) & 0x300)]);
|
||||
qx[1] = _mm256_set_epi64x(grid[ql[ 7] | ((qh[1] << 2) & 0x300)], grid[ql[ 6] | ((qh[1] << 4) & 0x300)], grid[ql[ 5] | ((qh[1] << 6) & 0x300)], grid[ql[ 4] | ((qh[1] << 8) & 0x300)]);
|
||||
qx[2] = _mm256_set_epi64x(grid[ql[11] | ((qh[2] << 2) & 0x300)], grid[ql[10] | ((qh[2] << 4) & 0x300)], grid[ql[ 9] | ((qh[2] << 6) & 0x300)], grid[ql[ 8] | ((qh[2] << 8) & 0x300)]);
|
||||
qx[3] = _mm256_set_epi64x(grid[ql[15] | ((qh[3] << 2) & 0x300)], grid[ql[14] | ((qh[3] << 4) & 0x300)], grid[ql[13] | ((qh[3] << 6) & 0x300)], grid[ql[12] | ((qh[3] << 8) & 0x300)]);
|
||||
ql += 16; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq2[ibl].signs + ib);
|
||||
auto scales = _mm_set1_epi32(s32[ib]);
|
||||
scales = _mm_and_si128(_mm_unpacklo_epi8(scales, _mm_srli_epi16(scales, 4)), _mm_set1_epi8(0xf));
|
||||
scales = _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi8(1));
|
||||
auto scales16 = _mm256_cvtepi8_epi16(scales); // 0...7, 0...7
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__m256i scs[2] = { _mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]) };
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[0], mask[0], _mm256_setzero_si256(), qx[0]));
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[1], mask[1], _mm256_setzero_si256(), qx[1]));
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[2], mask[2], _mm256_setzero_si256(), qx[2]));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_mask_sub_epi8(qx[3], mask[3], _mm256_setzero_si256(), qx[3]));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], y); // blocks: 0,0,0,0, 1,1,1,1, row 0
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], y); // blocks: 2,2,2,2, 3,3,3,3, row 1
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], y); // blocks: 4,4,4,4, 5,5,5,5, row 2
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], y); // blocks: 6,6,6,6, 7,7,7,7, row 3
|
||||
auto s12 = _mm256_packs_epi32(sumi1, sumi2); // 0,0,0,0, 2,2,2,2, 1,1,1,1, 3,3,3,3
|
||||
auto s34 = _mm256_packs_epi32(sumi3, sumi4); // 4,4,4,4, 6,6,6,6, 5,5,5,5, 7,7,7,7
|
||||
isum[2*iy+0] = _mm256_add_epi32(isum[2*iy+0], _mm256_madd_epi16(scs[0], s12));
|
||||
isum[2*iy+1] = _mm256_add_epi32(isum[2*iy+1], _mm256_madd_epi16(scs[1], s34));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[0] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[0], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[1] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[1], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
qx[2] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[2], s));
|
||||
s = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
qx[3] = _mm256_add_epi8(_mm256_set1_epi8(64), _mm256_sign_epi8(qx[3], s));
|
||||
__m256i scs[4] = {
|
||||
_mm256_shuffle_epi8(scales16, shuffles[0]), _mm256_shuffle_epi8(scales16, shuffles[1]),
|
||||
_mm256_shuffle_epi8(scales16, shuffles[2]), _mm256_shuffle_epi8(scales16, shuffles[3]),
|
||||
};
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(scs[0], _mm256_maddubs_epi16(qx[0], y)); // blocks 4x0, 4x1, row 0
|
||||
auto sumi2 = _mm256_madd_epi16(scs[1], _mm256_maddubs_epi16(qx[1], y)); // blocks 4x2, 4x3, row 1
|
||||
auto sumi3 = _mm256_madd_epi16(scs[2], _mm256_maddubs_epi16(qx[2], y)); // blocks 4x4, 4x5, row 2
|
||||
auto sumi4 = _mm256_madd_epi16(scs[3], _mm256_maddubs_epi16(qx[3], y)); // blocks 4x6, 4x7, row 3
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], sumi);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto sumi = _mm256_hadd_epi32(isum[2*iy+0], isum[2*iy+1]);
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(sumi), acc[iy]);
|
||||
isum[2*iy+0] = isum[2*iy+1] = _mm256_setzero_si256();
|
||||
#else
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, _mm_mul_ps(_mm_set1_ps(0.125f), sum));
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq3_xxs_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto smask = _mm256_set1_epi64x(0x8040201008040201);
|
||||
auto sign_shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
auto m4 = _mm256_set1_epi8(4);
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq3 = (const block_iq3_xxs_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_mul_ps(_mm_set1_ps(0.25f), _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq3[ibl].d))); // TODO: absorb the 0.25 factor into d when quantizing/repacking
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
qx[0] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+ 7]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 6]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 5]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 4]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+ 3]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 2]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 1]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 0]]);
|
||||
qx[1] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+15]], iq3xxs_grid[iq3[ibl].qs[32*ib+14]], iq3xxs_grid[iq3[ibl].qs[32*ib+13]], iq3xxs_grid[iq3[ibl].qs[32*ib+12]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+11]], iq3xxs_grid[iq3[ibl].qs[32*ib+10]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 9]], iq3xxs_grid[iq3[ibl].qs[32*ib+ 8]]);
|
||||
qx[2] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+23]], iq3xxs_grid[iq3[ibl].qs[32*ib+22]], iq3xxs_grid[iq3[ibl].qs[32*ib+21]], iq3xxs_grid[iq3[ibl].qs[32*ib+20]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+19]], iq3xxs_grid[iq3[ibl].qs[32*ib+18]], iq3xxs_grid[iq3[ibl].qs[32*ib+17]], iq3xxs_grid[iq3[ibl].qs[32*ib+16]]);
|
||||
qx[3] = _mm256_set_epi32(iq3xxs_grid[iq3[ibl].qs[32*ib+31]], iq3xxs_grid[iq3[ibl].qs[32*ib+30]], iq3xxs_grid[iq3[ibl].qs[32*ib+29]], iq3xxs_grid[iq3[ibl].qs[32*ib+28]],
|
||||
iq3xxs_grid[iq3[ibl].qs[32*ib+27]], iq3xxs_grid[iq3[ibl].qs[32*ib+26]], iq3xxs_grid[iq3[ibl].qs[32*ib+25]], iq3xxs_grid[iq3[ibl].qs[32*ib+24]]);
|
||||
auto sas = _mm_loadu_si128((const __m128i *)iq3[ibl].sas + ib);
|
||||
auto scales = _mm_and_si128(sas, _mm_set1_epi8(1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
scales = _mm_dpbusd_epi32(_mm_set1_epi32(1), scales, _mm_set1_epi32(0x10080402));
|
||||
#else
|
||||
scales = _mm_maddubs_epi16(scales, _mm_set1_epi32(0x10080402));
|
||||
scales = _mm_add_epi32(_mm_madd_epi16(_mm_set1_epi16(1), scales), _mm_set1_epi32(1));
|
||||
//auto t1 = _mm_or_si128(_mm_and_si128(scales, _mm_set1_epi32(0x00000001)), _mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x00000100)), 7));
|
||||
//auto t2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x00010000)), 14), _mm_srli_epi32(_mm_and_si128(scales, _mm_set1_epi32(0x01000000)), 21));
|
||||
//scales = _mm_or_si128(_mm_slli_epi32(_mm_or_si128(t1, t2), 1), _mm_set1_epi32(1));
|
||||
#endif
|
||||
auto scales32 = MM256_SET_M128I(scales, scales);
|
||||
auto signs128 = _mm_and_si128(sas, _mm_set1_epi8(-2)); // 0xfe = -2 as signed. Needed to shutup compiler warning.
|
||||
signs128 = _mm_xor_si128(signs128, _mm_srli_epi16(signs128, 1));
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto mask = (const __mmask32 *)&signs128;
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[0], _mm256_mask_sub_epi8(y, mask[0], _mm256_setzero_si256(), y));
|
||||
auto sumi2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[1], _mm256_mask_sub_epi8(y, mask[1], _mm256_setzero_si256(), y));
|
||||
auto sumi3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2], _mm256_mask_sub_epi8(y, mask[2], _mm256_setzero_si256(), y));
|
||||
auto sumi4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[3], _mm256_mask_sub_epi8(y, mask[3], _mm256_setzero_si256(), y));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#else
|
||||
auto signs = MM256_SET_M128I(signs128, signs128);
|
||||
auto shuffle = sign_shuffle;
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
shuffle = _mm256_add_epi8(shuffle, m4);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(signs, shuffle), smask), smask), _mm256_set1_epi8(1));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(y, s1)));
|
||||
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(y, s2)));
|
||||
auto sumi3 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(y, s3)));
|
||||
auto sumi4 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(y, s4)));
|
||||
auto s12 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi1, sumi2), _mm256_unpackhi_epi32(sumi1, sumi2)); // 0,1, 0,1, 0,1, 0,1
|
||||
auto s34 = _mm256_add_epi32(_mm256_unpacklo_epi32(sumi3, sumi4), _mm256_unpackhi_epi32(sumi3, sumi4)); // 2,3, 2,3, 2,3, 2,3
|
||||
auto sumi = _mm256_add_epi32(_mm256_unpacklo_epi64(s12, s34), _mm256_unpackhi_epi64(s12, s34)); // 0,1,2,3, 0,1,2,3
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(scales32, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, sum);
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
// Strangely enough, the following implementation makes PP ~6% slower and TG ~6% faster
|
||||
// compared to the vanilla AVX2 version below.
|
||||
struct IndexHelperIQ3S {
|
||||
union index_t {
|
||||
__m256i vec;
|
||||
uint16_t val[16];
|
||||
};
|
||||
inline void make2(const uint8_t * qs, const uint8_t * qh, __m256i * values) const {
|
||||
auto idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs));
|
||||
const __mmask16 * m16 = (const __mmask16 *)qh;
|
||||
index_t idx;
|
||||
idx.vec = _mm256_mask_add_epi16(idx_l, m16[0], idx_l, offset);
|
||||
values[0] = _mm256_set_epi32(iq3s_grid[idx.val[ 7]], iq3s_grid[idx.val[ 6]], iq3s_grid[idx.val[ 5]], iq3s_grid[idx.val[ 4]],
|
||||
iq3s_grid[idx.val[ 3]], iq3s_grid[idx.val[ 2]], iq3s_grid[idx.val[ 1]], iq3s_grid[idx.val[ 0]]);
|
||||
values[1] = _mm256_set_epi32(iq3s_grid[idx.val[15]], iq3s_grid[idx.val[14]], iq3s_grid[idx.val[13]], iq3s_grid[idx.val[12]],
|
||||
iq3s_grid[idx.val[11]], iq3s_grid[idx.val[10]], iq3s_grid[idx.val[ 9]], iq3s_grid[idx.val[ 8]]);
|
||||
}
|
||||
const __m256i offset = _mm256_set1_epi16(256);
|
||||
};
|
||||
#else
|
||||
struct IndexHelperIQ3S {
|
||||
union index_t {
|
||||
__m256i vec;
|
||||
uint32_t val[8];
|
||||
};
|
||||
inline void make2(const uint8_t * qs, const uint8_t * qh, __m256i * values) const {
|
||||
index_t idx;
|
||||
auto idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)qs));
|
||||
auto idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[0]), idx_shift), idx_mask);
|
||||
idx.vec = _mm256_or_si256(idx_h, idx_l);
|
||||
values[0] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]],
|
||||
iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]);
|
||||
idx_l = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(qs+8)));
|
||||
idx_h = _mm256_and_si256(_mm256_sllv_epi32(_mm256_set1_epi32(qh[1]), idx_shift), idx_mask);
|
||||
idx.vec = _mm256_or_si256(idx_h, idx_l);
|
||||
values[1] = _mm256_set_epi32(iq3s_grid[idx.val[7]], iq3s_grid[idx.val[6]], iq3s_grid[idx.val[5]], iq3s_grid[idx.val[4]],
|
||||
iq3s_grid[idx.val[3]], iq3s_grid[idx.val[2]], iq3s_grid[idx.val[1]], iq3s_grid[idx.val[0]]);
|
||||
}
|
||||
const __m256i idx_mask = _mm256_set1_epi32(256);
|
||||
const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
|
||||
};
|
||||
#endif
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_iq3_s_r4_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
auto smask = _mm256_set1_epi8(1);
|
||||
union { __m256i vec; uint32_t val[8]; } helper;
|
||||
union { __m128i vec; uint16_t val[8]; } hidx;
|
||||
__m256 acc[nrc_y] = {};
|
||||
__m256i isum[nrc_y] = {};
|
||||
__m256i qx[4];
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
__mmask32 mask[4];
|
||||
#endif
|
||||
for (int ix = 0; ix < nrc_x; ix += 4) {
|
||||
auto iq3 = (const block_iq3_s_r4 *)((const char *)vx + (ix+0)*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto dl = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)iq3[ibl].d));
|
||||
auto d4 = _mm256_set_m128(dl, dl);
|
||||
auto qs = iq3[ibl].qs;
|
||||
auto qh = iq3[ibl].qh;
|
||||
auto scale_bits = _mm_loadu_si128((const __m128i *)iq3[ibl].scales);
|
||||
auto scales8 = MM256_SET_M128I(_mm_srli_epi16(scale_bits, 4), scale_bits);
|
||||
helper.vec = _mm256_or_si256(_mm256_slli_epi16(_mm256_and_si256(scales8, _mm256_set1_epi8(0xf)), 1), _mm256_set1_epi8(1));
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
auto qh32 = (const uint32_t *)qh;
|
||||
auto idx_h = _mm_sllv_epi64(_mm_cvtepu8_epi16(_mm_set1_epi32(qh32[0])), _mm_set_epi64x(4, 8));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
auto idx_l = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(qs + 8*i)));
|
||||
hidx.vec = _mm_or_si128(idx_l, _mm_and_si128(idx_h, _mm_set1_epi16(0x100))); idx_h = _mm_srli_epi16(idx_h, 1);
|
||||
qx[i] = _mm256_set_epi32(iq3s_grid[hidx.val[7]], iq3s_grid[hidx.val[6]], iq3s_grid[hidx.val[5]], iq3s_grid[hidx.val[4]],
|
||||
iq3s_grid[hidx.val[3]], iq3s_grid[hidx.val[2]], iq3s_grid[hidx.val[1]], iq3s_grid[hidx.val[0]]);
|
||||
}
|
||||
qs += 32; qh += 4;
|
||||
auto signs128 = _mm_loadu_si128((const __m128i*)iq3[ibl].signs + ib);
|
||||
auto signs = MM256_SET_M128I(_mm_srli_epi16(signs128, 4), signs128);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto scales = _mm256_cvtepi8_epi32(_mm_set1_epi32(helper.val[ib]));
|
||||
mask[0] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[1] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[2] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
mask[3] = _mm256_cmpeq_epi8_mask(_mm256_and_si256(signs, smask), smask);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
auto ys = _mm256_shuffle_epi32(y, 0x00);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[0], _mm256_mask_sub_epi8(ys, mask[0], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0x55);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[1], _mm256_mask_sub_epi8(ys, mask[1], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0xaa);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[2], _mm256_mask_sub_epi8(ys, mask[2], _mm256_setzero_si256(), ys));
|
||||
ys = _mm256_shuffle_epi32(y, 0xff);
|
||||
sumi = _mm256_dpbusd_epi32(sumi, qx[3], _mm256_mask_sub_epi8(ys, mask[3], _mm256_setzero_si256(), ys));
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_mullo_epi32(sumi, scales));
|
||||
}
|
||||
#else
|
||||
auto scales16 = _mm256_cvtepi8_epi16(_mm_set1_epi32(helper.val[ib]));
|
||||
auto scales = _mm256_unpacklo_epi16(scales16, scales16);
|
||||
auto s1 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s2 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s3 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask); signs = _mm256_srli_epi16(signs, 1);
|
||||
auto s4 = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(signs, smask), smask), smask);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y = _mm256_loadu_si256((const __m256i *)q8.y[iy][ibl].qs + ib);
|
||||
auto sumi = _mm256_setzero_si256();
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[0], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), s1)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[1], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), s2)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[2], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), s3)));
|
||||
sumi = _mm256_add_epi16(sumi, _mm256_maddubs_epi16(qx[3], _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), s4)));
|
||||
isum[iy] = _mm256_add_epi32(isum[iy], _mm256_madd_epi16(scales, sumi));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
acc[iy] = _mm256_fmadd_ps(_mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl))), _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto sum = _mm_add_ps(_mm256_castps256_ps128(acc[iy]), _mm256_extractf128_ps(acc[iy], 1));
|
||||
info.store(ix, iy, sum);
|
||||
acc[iy] = _mm256_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The HAVE_FANCY_SIMD should only be #if defined(__AVX512_VNNI__ && defined(__AVX512VL__)
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q8_KV_r8_q8_KV(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
@@ -3136,7 +2275,12 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return ggml_type(typeB) == GGML_TYPE_Q8_K ? iqk_set_kernels_iquants(ne00, typeA, typeB, mm.funcs) : false;
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
case GGML_TYPE_IQ2_XS_R4:
|
||||
case GGML_TYPE_IQ2_S_R4:
|
||||
case GGML_TYPE_IQ3_XXS_R4:
|
||||
case GGML_TYPE_IQ3_S_R4:
|
||||
return ggml_type(typeB) == GGML_TYPE_Q8_K ? iqk_set_kernels_iquants(ne00, typeA, typeB, mm.funcs, mm.func16) : false;
|
||||
case GGML_TYPE_IQ4_KS:
|
||||
case GGML_TYPE_IQ5_KS:
|
||||
case GGML_TYPE_IQ4_KSS:
|
||||
@@ -3192,74 +2336,6 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
#endif
|
||||
expected_typeB = GGML_TYPE_Q8_K32;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
mm.funcs[0] = mul_mat_iq2_xxs_r4_q8_k<1>;
|
||||
mm.funcs[1] = mul_mat_iq2_xxs_r4_q8_k<2>;
|
||||
mm.funcs[2] = mul_mat_iq2_xxs_r4_q8_k<3>;
|
||||
mm.funcs[3] = mul_mat_iq2_xxs_r4_q8_k<4>;
|
||||
mm.funcs[4] = mul_mat_iq2_xxs_r4_q8_k<5>;
|
||||
mm.funcs[5] = mul_mat_iq2_xxs_r4_q8_k<6>;
|
||||
mm.funcs[6] = mul_mat_iq2_xxs_r4_q8_k<7>;
|
||||
mm.funcs[7] = mul_mat_iq2_xxs_r4_q8_k<8>;
|
||||
mm.func16 = mul_mat_iq2_xxs_r4_q8_k<16>;
|
||||
expected_typeB = GGML_TYPE_Q8_K;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
mm.funcs[0] = mul_mat_iq2_xs_r4_q8_k<1>;
|
||||
mm.funcs[1] = mul_mat_iq2_xs_r4_q8_k<2>;
|
||||
mm.funcs[2] = mul_mat_iq2_xs_r4_q8_k<3>;
|
||||
mm.funcs[3] = mul_mat_iq2_xs_r4_q8_k<4>;
|
||||
mm.funcs[4] = mul_mat_iq2_xs_r4_q8_k<5>;
|
||||
mm.funcs[5] = mul_mat_iq2_xs_r4_q8_k<6>;
|
||||
mm.funcs[6] = mul_mat_iq2_xs_r4_q8_k<7>;
|
||||
mm.funcs[7] = mul_mat_iq2_xs_r4_q8_k<8>;
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
// For some reason Zen4 does not like this particular function
|
||||
mm.func16 = mul_mat_iq2_xs_r4_q8_k_16;
|
||||
#endif
|
||||
expected_typeB = GGML_TYPE_Q8_K;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
mm.funcs[0] = mul_mat_iq2_s_r4_q8_k<1>;
|
||||
mm.funcs[1] = mul_mat_iq2_s_r4_q8_k<2>;
|
||||
mm.funcs[2] = mul_mat_iq2_s_r4_q8_k<3>;
|
||||
mm.funcs[3] = mul_mat_iq2_s_r4_q8_k<4>;
|
||||
mm.funcs[4] = mul_mat_iq2_s_r4_q8_k<5>;
|
||||
mm.funcs[5] = mul_mat_iq2_s_r4_q8_k<6>;
|
||||
mm.funcs[6] = mul_mat_iq2_s_r4_q8_k<7>;
|
||||
mm.funcs[7] = mul_mat_iq2_s_r4_q8_k<8>;
|
||||
mm.func16 = mul_mat_iq2_s_r4_q8_k_16;
|
||||
expected_typeB = GGML_TYPE_Q8_K;
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
mm.funcs[0] = mul_mat_iq3_xxs_r4_q8_k<1>;
|
||||
mm.funcs[1] = mul_mat_iq3_xxs_r4_q8_k<2>;
|
||||
mm.funcs[2] = mul_mat_iq3_xxs_r4_q8_k<3>;
|
||||
mm.funcs[3] = mul_mat_iq3_xxs_r4_q8_k<4>;
|
||||
mm.funcs[4] = mul_mat_iq3_xxs_r4_q8_k<5>;
|
||||
mm.funcs[5] = mul_mat_iq3_xxs_r4_q8_k<6>;
|
||||
mm.funcs[6] = mul_mat_iq3_xxs_r4_q8_k<7>;
|
||||
mm.funcs[7] = mul_mat_iq3_xxs_r4_q8_k<8>;
|
||||
mm.func16 = mul_mat_iq3_xxs_r4_q8_k<16>;
|
||||
expected_typeB = GGML_TYPE_Q8_K;
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S_R4:
|
||||
assert (ne00 % QK_K == 0);
|
||||
mm.funcs[0] = mul_mat_iq3_s_r4_q8_k<1>;
|
||||
mm.funcs[1] = mul_mat_iq3_s_r4_q8_k<2>;
|
||||
mm.funcs[2] = mul_mat_iq3_s_r4_q8_k<3>;
|
||||
mm.funcs[3] = mul_mat_iq3_s_r4_q8_k<4>;
|
||||
mm.funcs[4] = mul_mat_iq3_s_r4_q8_k<5>;
|
||||
mm.funcs[5] = mul_mat_iq3_s_r4_q8_k<6>;
|
||||
mm.funcs[6] = mul_mat_iq3_s_r4_q8_k<7>;
|
||||
mm.funcs[7] = mul_mat_iq3_s_r4_q8_k<8>;
|
||||
mm.func16 = mul_mat_iq3_s_r4_q8_k<16>;
|
||||
expected_typeB = GGML_TYPE_Q8_K;
|
||||
break;
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
assert (ne00 % 32 == 0);
|
||||
mm.funcs[0] = mul_mat_q8_KV_r8_q8_KV<1>;
|
||||
|
||||
Reference in New Issue
Block a user