Performance fixes for gcc compiler in fringe kernels

Description:
1. GCC avoids loading b into registers in the m fringe
   kernels of the int8 kernels. Instead, gcc generates
   fma with a memory operand for the B input.

2. This causes a performance regression for larger n,
   where each fma must reload the input from memory
   again and again.

3. This is observed with gcc but not with clang.

4. Inserted dummy shuffle instructions on the b data to
   explicitly tell the compiler that b needs to be kept
   in registers.

   AMD-Internal: SWLCSG-2948

Change-Id: Ibbf186fe6569e6265e2c2bb4ec3141ef323ea3e6
This commit is contained in:
Nallani Bhaskar
2024-08-05 18:28:01 +00:00
parent 09c45525f4
commit e712673ab7
3 changed files with 95 additions and 10 deletions

View File

@@ -874,10 +874,13 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x64)
/*Load 32 elements from row0 of B*/
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
/*Load Next 32 elements from row0 of B*/
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
/*Broadcast col0 elements of 12 rows of A*/
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
@@ -1537,9 +1540,14 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x64)
/*Load 32 elements from row0 of B*/
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
/*Load Next 32 elements from row0 of B*/
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
/*Broadcast col0 elements of 12 rows of A*/
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
@@ -2069,11 +2077,15 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x64)
/*Load 32 elements from row0 of B*/
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
/*Load Next 32 elements from row0 of B*/
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
/*Broadcast col0 elements of 12 rows of A*/
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1

View File

@@ -1243,18 +1243,26 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64)
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
//convert signed int8 to uint8 for VNNI
b0 = _mm512_shuffle_epi8(b0, dsmask);
// convert signed int8 to uint8 for VNNI
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
b1 = _mm512_shuffle_epi8(b1, dsmask);
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
b2 = _mm512_shuffle_epi8(b2, dsmask);
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
b3 = _mm512_shuffle_epi8(b3, dsmask);
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
@@ -2205,9 +2213,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
__m512i c_int32_2p2 = _mm512_setzero_epi32();
__m512i c_int32_2p3 = _mm512_setzero_epi32();
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
b0 = _mm512_shuffle_epi8( b0, dsmask );
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
@@ -2215,8 +2234,12 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
b1 = _mm512_shuffle_epi8( b1, dsmask );
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
b2 = _mm512_shuffle_epi8( b2, dsmask );
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
b3 = _mm512_shuffle_epi8( b3, dsmask );
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
@@ -2985,10 +3008,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
__m512i c_int32_1p1 = _mm512_setzero_epi32();
__m512i c_int32_1p2 = _mm512_setzero_epi32();
__m512i c_int32_1p3 = _mm512_setzero_epi32();
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
b0 = _mm512_shuffle_epi8( b0, dsmask);
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
@@ -2996,8 +3029,11 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
b1 = _mm512_shuffle_epi8( b1, dsmask );
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
b2 = _mm512_shuffle_epi8( b2, dsmask );
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
b3 = _mm512_shuffle_epi8( b3, dsmask );
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]

View File

@@ -1168,15 +1168,25 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64)
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
b0 = _mm512_shuffle_epi8(b0, dsmask);
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
b1 = _mm512_shuffle_epi8(b1, dsmask);
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
b2 = _mm512_shuffle_epi8(b2, dsmask);
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
b3 = _mm512_shuffle_epi8(b3, dsmask);
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
@@ -2065,15 +2075,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64)
__m512i c_int32_2p2 = _mm512_setzero_epi32();
__m512i c_int32_2p3 = _mm512_setzero_epi32();
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
b0 = _mm512_shuffle_epi8(b0, dsmask);
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
b1 = _mm512_shuffle_epi8(b1, dsmask);
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
b2 = _mm512_shuffle_epi8(b2, dsmask);
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
b3 = _mm512_shuffle_epi8(b3, dsmask);
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
@@ -2794,16 +2818,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64)
__m512i c_int32_1p2 = _mm512_setzero_epi32();
__m512i c_int32_1p3 = _mm512_setzero_epi32();
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
// registers while generating the code. A dummy shuffle instruction
// is used on b data to explicitly specify to gcc compiler
// b data needs to be kept in registers to reuse across FMA's
__m512i dsmask = _mm512_set_epi64(
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100,
0x0F0E0D0C0B0A0908, 0x0706050403020100);
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
{
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
b0 = _mm512_shuffle_epi8(b0, dsmask);
// Broadcast a[0,kr:kr+4].
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
b1 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 1));
b2 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 2));
b3 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 3));
b1 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 1));
b1 = _mm512_shuffle_epi8( b1, dsmask);
b2 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 2));
b2 = _mm512_shuffle_epi8( b2, dsmask);
b3 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 3));
b3 = _mm512_shuffle_epi8( b3, dsmask);
// Perform column direction mat-mul with k = 4.
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]