mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
bitnet: remove iq1_bn lookup table storing +/- signs
The AVX2 implementation was the only one left using it, so I decided to see if we can get a performant implementation using the 0,1,2 lookup table. Turns out we can, and it is even slightly faster than the sign based table. We now get PP-512 = 275 t/s and TG-128 = 57.7 t/s with 16 threads on the Ryzen-7950X. With only one lookup table left for iq1_bn, I renamed it to iq1bn_grid_u16.
This commit is contained in:
@@ -1342,33 +1342,35 @@ template <int nrc> struct Q8_K64 {
|
||||
|
||||
struct DequantizerIQ1BN {
|
||||
const __m256i m1_8 = _mm256_set1_epi8(1);
|
||||
const __m256i shuff1 = _mm256_set_epi64x(0x0808080808080808, 0x0000000000000000, 0x0808080808080808, 0x0000000000000000);
|
||||
const __m256i shuff2 = _mm256_add_epi8(shuff1, m1_8);
|
||||
const __m256i shuff1 = _mm256_set_epi64x(0x0908090809080908, 0x0100010001000100, 0x0908090809080908, 0x0100010001000100);
|
||||
#if defined __AVX512F__ && defined __AVX512VL__
|
||||
const __m256i minus1 = _mm256_set1_epi64x(0xffff);
|
||||
const __m256i minus1 = _mm256_set1_epi64x(0xaaaa);
|
||||
const __m256i shifts = _mm256_set1_epi64x(0x0006000400020000);
|
||||
#else
|
||||
const __m256i shuff3 = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
|
||||
const __m256i shuff4 = _mm256_set_epi64x(0x0707070707070707, 0x0606060606060606, 0x0505050505050505, 0x0404040404040404);
|
||||
#endif
|
||||
const __m256i mask1 = _mm256_set1_epi64x(0x8040201008040201);
|
||||
#endif
|
||||
const __m256i qmask = _mm256_set1_epi8(0x03);
|
||||
|
||||
IQK_ALWAYS_INLINE void prepare_iq1bn_quants(uint8_t extra, const uint8_t * ql, const uint8_t * qh, __m256i& v1, __m256i& v2) {
|
||||
|
||||
auto aux1 = _mm256_set_epi64x(iq1bn_grid_xxx[ql[3] | ((qh[1] << 4) & 0x0f00)], iq1bn_grid_xxx[ql[2] | ((qh[1] << 8) & 0x0f00)],
|
||||
iq1bn_grid_xxx[ql[1] | ((qh[0] << 4) & 0x0f00)], iq1bn_grid_xxx[ql[0] | ((qh[0] << 8) & 0x0f00)]);
|
||||
auto aux2 = _mm256_set_epi64x(iq1bn_grid_xxx[ql[7] | ((qh[3] << 4) & 0x0f00)], iq1bn_grid_xxx[ql[6] | ((qh[3] << 8) & 0x0f00)],
|
||||
iq1bn_grid_xxx[ql[5] | ((qh[2] << 4) & 0x0f00)], iq1bn_grid_xxx[ql[4] | ((qh[2] << 8) & 0x0f00)]);
|
||||
auto aux1 = _mm256_set_epi64x(iq1bn_grid_u16[ql[3] | ((qh[1] << 4) & 0x0f00)], iq1bn_grid_u16[ql[2] | ((qh[1] << 8) & 0x0f00)],
|
||||
iq1bn_grid_u16[ql[1] | ((qh[0] << 4) & 0x0f00)], iq1bn_grid_u16[ql[0] | ((qh[0] << 8) & 0x0f00)]);
|
||||
auto aux2 = _mm256_set_epi64x(iq1bn_grid_u16[ql[7] | ((qh[3] << 4) & 0x0f00)], iq1bn_grid_u16[ql[6] | ((qh[3] << 8) & 0x0f00)],
|
||||
iq1bn_grid_u16[ql[5] | ((qh[2] << 4) & 0x0f00)], iq1bn_grid_u16[ql[4] | ((qh[2] << 8) & 0x0f00)]);
|
||||
#if defined __AVX512F__ && defined __AVX512VL__
|
||||
aux1 = _mm256_mask_sub_epi64(aux1, extra & 0xf, minus1, aux1);
|
||||
aux2 = _mm256_mask_sub_epi64(aux2, extra >> 4, minus1, aux2);
|
||||
#endif
|
||||
|
||||
v1 = _mm256_sub_epi8(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(aux1, shuff2), mask1), mask1),
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(aux1, shuff1), mask1), mask1));
|
||||
v2 = _mm256_sub_epi8(_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(aux2, shuff2), mask1), mask1),
|
||||
_mm256_cmpeq_epi8(_mm256_and_si256(_mm256_shuffle_epi8(aux2, shuff1), mask1), mask1));
|
||||
|
||||
#if !(defined __AVX512F__ && defined __AVX512VL__)
|
||||
v1 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srlv_epi16(_mm256_shuffle_epi8(aux1, shuff1), shifts), qmask), m1_8);
|
||||
v2 = _mm256_sub_epi8(_mm256_and_si256(_mm256_srlv_epi16(_mm256_shuffle_epi8(aux2, shuff1), shifts), qmask), m1_8);
|
||||
#else
|
||||
aux1 = _mm256_or_si256(aux1, _mm256_slli_epi64(aux1, 14));
|
||||
aux2 = _mm256_or_si256(aux2, _mm256_slli_epi64(aux2, 14));
|
||||
aux1 = _mm256_or_si256(aux1, _mm256_slli_epi64(aux1, 28));
|
||||
aux2 = _mm256_or_si256(aux2, _mm256_slli_epi64(aux2, 28));
|
||||
v1 = _mm256_sub_epi8(_mm256_and_si256(aux1, qmask), m1_8);
|
||||
v2 = _mm256_sub_epi8(_mm256_and_si256(aux2, qmask), m1_8);
|
||||
auto all_signs = _mm256_set1_epi8(extra);
|
||||
all_signs = _mm256_or_si256(_mm256_cmpeq_epi8(_mm256_and_si256(all_signs, mask1), mask1), m1_8);
|
||||
v1 = _mm256_sign_epi8(v1, _mm256_shuffle_epi8(all_signs, shuff3));
|
||||
@@ -4373,10 +4375,10 @@ struct DequantizerIQ1BN {
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
const uint8_t * h = (const uint8_t *)aux32;
|
||||
|
||||
a.val[0] = uint64x2_t{iq1bn_grid_zzz[ql[0] | (h[0] << 8)], iq1bn_grid_zzz[ql[1] | (h[4] << 4)]};
|
||||
a.val[1] = uint64x2_t{iq1bn_grid_zzz[ql[2] | (h[1] << 8)], iq1bn_grid_zzz[ql[3] | (h[5] << 4)]};
|
||||
a.val[2] = uint64x2_t{iq1bn_grid_zzz[ql[4] | (h[2] << 8)], iq1bn_grid_zzz[ql[5] | (h[6] << 4)]};
|
||||
a.val[3] = uint64x2_t{iq1bn_grid_zzz[ql[6] | (h[3] << 8)], iq1bn_grid_zzz[ql[7] | (h[7] << 4)]};
|
||||
a.val[0] = uint64x2_t{iq1bn_grid_u16[ql[0] | (h[0] << 8)], iq1bn_grid_u16[ql[1] | (h[4] << 4)]};
|
||||
a.val[1] = uint64x2_t{iq1bn_grid_u16[ql[2] | (h[1] << 8)], iq1bn_grid_u16[ql[3] | (h[5] << 4)]};
|
||||
a.val[2] = uint64x2_t{iq1bn_grid_u16[ql[4] | (h[2] << 8)], iq1bn_grid_u16[ql[5] | (h[6] << 4)]};
|
||||
a.val[3] = uint64x2_t{iq1bn_grid_u16[ql[6] | (h[3] << 8)], iq1bn_grid_u16[ql[7] | (h[7] << 4)]};
|
||||
|
||||
v.val[0] = vsubq_s8(vandq_u8(vshlq_u16(vqtbl1q_u8(vreinterpretq_u8_u64(a.val[0]), shuff1), shift), qmask), m1);
|
||||
v.val[1] = vsubq_s8(vandq_u8(vshlq_u16(vqtbl1q_u8(vreinterpretq_u8_u64(a.val[1]), shuff1), shift), qmask), m1);
|
||||
|
||||
Reference in New Issue
Block a user