mirror of
https://github.com/amd/blis.git
synced 2026-03-14 22:37:23 +00:00
Performance fixes for the GCC compiler in fringe kernels
Description: 1. GCC avoids loading B into registers in the m-fringe kernels of the int8 kernels; instead, GCC generates FMAs with a memory operand for the B input. 2. This causes a performance regression for larger n, where each FMA must reload the input from memory again and again. 3. This is observed with GCC but not with Clang. 4. Inserted dummy shuffle instructions on the B data to explicitly tell the compiler that B needs to be kept in registers. AMD-Internal: SWLCSG-2948 Change-Id: Ibbf186fe6569e6265e2c2bb4ec3141ef323ea3e6
This commit is contained in:
@@ -874,10 +874,13 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x64)
|
||||
/*Load 32 elements from row0 of B*/
|
||||
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
|
||||
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
|
||||
|
||||
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
|
||||
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
|
||||
/*Load Next 32 elements from row0 of B*/
|
||||
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
|
||||
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
|
||||
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
|
||||
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
|
||||
|
||||
/*Broadcast col0 elements of 12 rows of A*/
|
||||
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
|
||||
@@ -1537,9 +1540,14 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x64)
|
||||
/*Load 32 elements from row0 of B*/
|
||||
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
|
||||
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
|
||||
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
|
||||
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
|
||||
|
||||
/*Load Next 32 elements from row0 of B*/
|
||||
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
|
||||
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
|
||||
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
|
||||
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
|
||||
|
||||
/*Broadcast col0 elements of 12 rows of A*/
|
||||
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
|
||||
@@ -2069,11 +2077,15 @@ LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x64)
|
||||
/*Load 32 elements from row0 of B*/
|
||||
zmm0 = _mm512_loadu_ps (bbuf ); //load 0-15 values from current row
|
||||
zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
|
||||
zmm0 = _mm512_shuffle_ps(zmm0, zmm0, 0xE4); // dummy shuffle
|
||||
zmm1 = _mm512_shuffle_ps(zmm1, zmm1, 0xE4); // dummy shuffle
|
||||
|
||||
/*Load Next 32 elements from row0 of B*/
|
||||
zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
|
||||
zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
|
||||
|
||||
zmm6 = _mm512_shuffle_ps(zmm6, zmm6, 0xE4); // dummy shuffle
|
||||
zmm7 = _mm512_shuffle_ps(zmm7, zmm7, 0xE4); // dummy shuffle
|
||||
|
||||
/*Broadcast col0 elements of 12 rows of A*/
|
||||
zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
|
||||
zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
|
||||
|
||||
@@ -1243,18 +1243,26 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64)
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
//convert signed int8 to uint8 for VNNI
|
||||
b0 = _mm512_shuffle_epi8(b0, dsmask);
|
||||
// convert signed int8 to uint8 for VNNI
|
||||
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
|
||||
b1 = _mm512_shuffle_epi8(b1, dsmask);
|
||||
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
|
||||
b2 = _mm512_shuffle_epi8(b2, dsmask);
|
||||
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
|
||||
b3 = _mm512_shuffle_epi8(b3, dsmask);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
@@ -2205,9 +2213,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
|
||||
__m512i c_int32_2p2 = _mm512_setzero_epi32();
|
||||
__m512i c_int32_2p3 = _mm512_setzero_epi32();
|
||||
|
||||
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
b0 = _mm512_shuffle_epi8( b0, dsmask );
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
@@ -2215,8 +2234,12 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
|
||||
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
|
||||
b1 = _mm512_shuffle_epi8( b1, dsmask );
|
||||
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
|
||||
b2 = _mm512_shuffle_epi8( b2, dsmask );
|
||||
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
|
||||
b3 = _mm512_shuffle_epi8( b3, dsmask );
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
|
||||
@@ -2985,10 +3008,20 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
|
||||
__m512i c_int32_1p1 = _mm512_setzero_epi32();
|
||||
__m512i c_int32_1p2 = _mm512_setzero_epi32();
|
||||
__m512i c_int32_1p3 = _mm512_setzero_epi32();
|
||||
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
b0 = _mm512_shuffle_epi8( b0, dsmask);
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
@@ -2996,8 +3029,11 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
|
||||
a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
|
||||
b1 = _mm512_shuffle_epi8( b1, dsmask );
|
||||
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
|
||||
b2 = _mm512_shuffle_epi8( b2, dsmask );
|
||||
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
|
||||
b3 = _mm512_shuffle_epi8( b3, dsmask );
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
|
||||
@@ -1168,15 +1168,25 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64)
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
|
||||
for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
b0 = _mm512_shuffle_epi8(b0, dsmask);
|
||||
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
|
||||
b1 = _mm512_shuffle_epi8(b1, dsmask);
|
||||
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
|
||||
b2 = _mm512_shuffle_epi8(b2, dsmask);
|
||||
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
|
||||
b3 = _mm512_shuffle_epi8(b3, dsmask);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
@@ -2065,15 +2075,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64)
|
||||
__m512i c_int32_2p2 = _mm512_setzero_epi32();
|
||||
__m512i c_int32_2p3 = _mm512_setzero_epi32();
|
||||
|
||||
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
b0 = _mm512_shuffle_epi8(b0, dsmask);
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
|
||||
b1 = _mm512_shuffle_epi8(b1, dsmask);
|
||||
b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
|
||||
b2 = _mm512_shuffle_epi8(b2, dsmask);
|
||||
b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
|
||||
b3 = _mm512_shuffle_epi8(b3, dsmask);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
@@ -2794,16 +2818,29 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64)
|
||||
__m512i c_int32_1p2 = _mm512_setzero_epi32();
|
||||
__m512i c_int32_1p3 = _mm512_setzero_epi32();
|
||||
|
||||
// gcc compiler (atleast 11.2 to 13.1) avoid loading B into
|
||||
// registers while generating the code. A dummy shuffle instruction
|
||||
// is used on b data to explicitly specify to gcc compiler
|
||||
// b data needs to be kept in registers to reuse across FMA's
|
||||
__m512i dsmask = _mm512_set_epi64(
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100,
|
||||
0x0F0E0D0C0B0A0908, 0x0706050403020100);
|
||||
|
||||
for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
|
||||
{
|
||||
b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
|
||||
|
||||
b0 = _mm512_shuffle_epi8(b0, dsmask);
|
||||
// Broadcast a[0,kr:kr+4].
|
||||
a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
|
||||
|
||||
b1 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 1));
|
||||
b2 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 2));
|
||||
b3 = _mm512_loadu_si512(b + (rs_b * kr) + (cs_b * 3));
|
||||
b1 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 1));
|
||||
b1 = _mm512_shuffle_epi8( b1, dsmask);
|
||||
b2 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 2));
|
||||
b2 = _mm512_shuffle_epi8( b2, dsmask);
|
||||
b3 = _mm512_loadu_si512( b + (rs_b * kr) + (cs_b * 3));
|
||||
b3 = _mm512_shuffle_epi8( b3, dsmask);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
|
||||
|
||||
Reference in New Issue
Block a user