From dfdc4dbee6c5c872c8b681e9cfe9420bc390cf56 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Mon, 17 Jun 2024 08:24:51 +0300 Subject: [PATCH] iqk_mul_mat: improve iq1_bn (bitnet) on vanilla AVX2 I now get PP-512 = 270 t/s on the Ryzen-5975WX --- iqk_mul_mat.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/iqk_mul_mat.cpp b/iqk_mul_mat.cpp index c204b22c..f38163d5 100644 --- a/iqk_mul_mat.cpp +++ b/iqk_mul_mat.cpp @@ -1383,9 +1383,7 @@ static void mul_mat_iq1bn_q8_K64(int n, const void * vx, size_t bx, const DataIn #if defined __AVX512VNNI__ && defined __AVX512VL__ auto dot = _mm256_dpbusd_epi32(_mm256_dpbusd_epi32(_mm256_setzero_si256(), m1_8, dot1), m1_8, dot2); #else - dot1 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot1)); - dot2 = _mm256_madd_epi16(m1_16, _mm256_maddubs_epi16(m1_8, dot2)); - auto dot = _mm256_add_epi32(_mm256_add_epi32(dot1, dot2)); + auto dot = _mm256_madd_epi16(m1_16, _mm256_add_epi16(_mm256_maddubs_epi16(m1_8, dot1), _mm256_maddubs_epi16(m1_8, dot2))); #endif accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(q8.scale(iy, i)), _mm256_cvtepi32_ps(dot), accd[iy]); }