From e712673ab7efb63ff62b46c2fa42ece1f06ec8ff Mon Sep 17 00:00:00 2001 From: Nallani Bhaskar Date: Mon, 5 Aug 2024 18:28:01 +0000 Subject: [PATCH] Performance fixes for gcc compiler in fringe kernels Description: 1. GCC avoids loading b into registers in m fringe kernels of int8 kernels. Instead gcc generates fma with memory as an operand for B input. 2. This is causing performance regression for larger n where each fma needs to load the input from memory again and again. 3. This is observed with gcc but not with clang. 4. Inserted dummy shuffle instructions for b data to further explicitly tell compiler that b needs to be in registers. AMD-Internal: SWLCSG-2948 Change-Id: Ibbf186fe6569e6265e2c2bb4ec3141ef323ea3e6 --- .../f32f32f32/lpgemm_fringe_f32_avx512.c | 16 +++++- .../s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c | 40 ++++++++++++++- .../u8s8s32/lpgemm_m_fringe_amd512vnni.c | 49 ++++++++++++++++--- 3 files changed, 95 insertions(+), 10 deletions(-) diff --git a/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c b/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c index 75b0162e1..7f3487347 100644 --- a/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c +++ b/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c @@ -874,10 +874,13 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x64) /*Load 32 elements from row0 of B*/ zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row - + zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle + zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle /*Load Next 32 elements from row0 of B*/ zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row + zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle + zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle /*Broadcast col0 elements of 12 rows of A*/ zmm2 = 
_mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0 @@ -1537,9 +1540,14 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x64) /*Load 32 elements from row0 of B*/ zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row + zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle + zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle + /*Load Next 32 elements from row0 of B*/ zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row + zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle + zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle /*Broadcast col0 elements of 12 rows of A*/ zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0 @@ -2069,11 +2077,15 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x64) /*Load 32 elements from row0 of B*/ zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row + zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle + zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle /*Load Next 32 elements from row0 of B*/ zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row - + zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle + zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle + /*Broadcast col0 elements of 12 rows of A*/ zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0 zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1 diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c index 677a5b08f..44038a229 100644 --- a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c +++ 
b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c @@ -1243,18 +1243,26 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64) // registers while generating the code. A dummy shuffle instruction // is used on b data to explicitly specify to gcc compiler // b data needs to be kept in registers to reuse across FMA's + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - - //convert signed int8 to uint8 for VNNI + b0 = _mm512_shuffle_epi8(b0, dsmask); + // convert signed int8 to uint8 for VNNI a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b1 = _mm512_shuffle_epi8(b1, dsmask); b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b2 = _mm512_shuffle_epi8(b2, dsmask); b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b3 = _mm512_shuffle_epi8(b3, dsmask); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2205,9 +2213,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) __m512i c_int32_2p2 = _mm512_setzero_epi32(); __m512i c_int32_2p3 = _mm512_setzero_epi32(); + // gcc compiler (atleast 11.2 to 13.1) avoid loading B into + // registers while generating the code. 
A dummy shuffle instruction + // is used on b data to explicitly specify to gcc compiler + // b data needs to be kept in registers to reuse across FMA's + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); + for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_shuffle_epi8( b0, dsmask ); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2215,8 +2234,12 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64) a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b1 = _mm512_shuffle_epi8( b1, dsmask ); b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b2 = _mm512_shuffle_epi8( b2, dsmask ); b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b3 = _mm512_shuffle_epi8( b3, dsmask ); + // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 ); @@ -2985,10 +3008,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) __m512i c_int32_1p1 = _mm512_setzero_epi32(); __m512i c_int32_1p2 = _mm512_setzero_epi32(); __m512i c_int32_1p3 = _mm512_setzero_epi32(); + // gcc compiler (atleast 11.2 to 13.1) avoid loading B into + // registers while generating the code. 
A dummy shuffle instruction + // is used on b data to explicitly specify to gcc compiler + // b data needs to be kept in registers to reuse across FMA's + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_shuffle_epi8( b0, dsmask); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); @@ -2996,8 +3029,11 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64) a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 ); b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b1 = _mm512_shuffle_epi8( b1, dsmask ); b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b2 = _mm512_shuffle_epi8( b2, dsmask ); b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b3 = _mm512_shuffle_epi8( b3, dsmask ); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c index d9db59640..73f2f9740 100644 --- a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c +++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c @@ -1168,15 +1168,25 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64) // registers while generating the code. 
A dummy shuffle instruction // is used on b data to explicitly specify to gcc compiler // b data needs to be kept in registers to reuse across FMA's - for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); + + for (dim_t kr = 0; kr < k_full_pieces; kr += 1) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - + b0 = _mm512_shuffle_epi8(b0, dsmask); b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b1 = _mm512_shuffle_epi8(b1, dsmask); b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b2 = _mm512_shuffle_epi8(b2, dsmask); b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b3 = _mm512_shuffle_epi8(b3, dsmask); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2065,15 +2075,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64) __m512i c_int32_2p2 = _mm512_setzero_epi32(); __m512i c_int32_2p3 = _mm512_setzero_epi32(); + // gcc compiler (atleast 11.2 to 13.1) avoid loading B into + // registers while generating the code. A dummy shuffle instruction + // is used on b data to explicitly specify to gcc compiler + // b data needs to be kept in registers to reuse across FMA's + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); + for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); + b0 = _mm512_shuffle_epi8(b0, dsmask); // Broadcast a[0,kr:kr+4]. 
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) ); + b1 = _mm512_shuffle_epi8(b1, dsmask); b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) ); + b2 = _mm512_shuffle_epi8(b2, dsmask); b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) ); + b3 = _mm512_shuffle_epi8(b3, dsmask); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63] @@ -2794,16 +2818,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64) __m512i c_int32_1p2 = _mm512_setzero_epi32(); __m512i c_int32_1p3 = _mm512_setzero_epi32(); + // gcc compiler (atleast 11.2 to 13.1) avoid loading B into + // registers while generating the code. A dummy shuffle instruction + // is used on b data to explicitly specify to gcc compiler + // b data needs to be kept in registers to reuse across FMA's + __m512i dsmask = _mm512_set_epi64( + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100, + 0x0F0E0D0C0B0A0908, 0x0706050403020100); + for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) { b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); - + b0 = _mm512_shuffle_epi8(b0, dsmask); // Broadcast a[0,kr:kr+4]. a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) ); - b1 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 1)); - b2 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 2)); - b3 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 3)); + b1 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 1)); + b1 = _mm512_shuffle_epi8( b1, dsmask); + b2 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 2)); + b2 = _mm512_shuffle_epi8( b2, dsmask); + b3 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 3)); + b3 = _mm512_shuffle_epi8( b3, dsmask); // Perform column direction mat-mul with k = 4. // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]