mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-28 09:04:10 +00:00
iqk_mul_mat: improve iq1_bn (bitnet) on vanilla AVX2
I now get PP-512 = 270 t/s on the Ryzen-5975WX.
This commit is contained in:
@@ -1383,9 +1383,7 @@ static void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const DataIn
|
||||
#if defined __AVX512VNNI__ && defined __AVX512VL__
|
||||
auto dot = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), m1_8, dot1), m1_8, dot2);
|
||||
#else
|
||||
-dot1 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot1));
|
||||
-dot2 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot2));
|
||||
-auto dot = _mm256_add_epi32(_mm256_add_epi32(dot1, dot2));
|
||||
+auto dot = _mm256_madd_epi16(m1_16, _mm256_add_epi16(_mm256_maddubs_epi16(m1_8, dot1), _mm256_maddubs_epi16(m1_8, dot2)));
|
||||
#endif
|
||||
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(q8.scale(iy, i)), _mm256_cvtepi32_ps(dot), accd[iy]);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user