mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Performance fix for FP32 GEMV (#47)
Details: - In the FP32 GEMM interface, mtag_b was being set to PACK by default. This led to the B matrix being packed even when packing was not strictly required, causing a performance regression. - mtag_b is now set to PACK only when packing the B matrix is absolutely necessary, and the check conditions before packing have been modified accordingly. AMD-Internal - [SWLCSG-3575]
This commit is contained in:
committed by
GitHub
parent
49ae7db89a
commit
8968973c2d
@@ -173,15 +173,11 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
goto err_hndl;
|
||||
}
|
||||
|
||||
// By default enable packing for B matrix. Before the 5 loop, based on
|
||||
// the input dimensions, the smart threading logic will adjust it
|
||||
// (disable/enable) accordingly.
|
||||
if ( ( is_row_major == TRUE ) && ( mtag_b == UNPACKED ) )
|
||||
if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transb ) ) && ( mtag_b == UNPACKED ) )
|
||||
{
|
||||
mtag_b = PACK;
|
||||
}
|
||||
// Inputs swapped in column major, A becomes B from kernel point of view.
|
||||
else if ( ( is_column_major == TRUE ) && ( mtag_a == UNPACKED ) )
|
||||
else if( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transa ) ) && ( mtag_a == UNPACKED ) )
|
||||
{
|
||||
mtag_a = PACK;
|
||||
}
|
||||
|
||||
@@ -154,6 +154,12 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
if (c_downscale < F32) post_ops_attr.buf_downscale = c;
|
||||
else post_ops_attr.buf_downscale = NULL;
|
||||
|
||||
// Should_pack_A/B is set either by the user through env variable
|
||||
// or by the smart threading logic based on work distribution.
|
||||
// Storage format of the matrices doesn't affect should_pack_A/B.
|
||||
bool should_pack_B = bli_rntm_pack_b( rntm );
|
||||
bool should_pack_A = bli_rntm_pack_a( rntm );
|
||||
|
||||
// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
|
||||
thrinfo_t thread_jc;
|
||||
thrinfo_t thread_ic;
|
||||
@@ -189,7 +195,7 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#endif
|
||||
// Pack B matrix if rs_b > 1
|
||||
if( ( mtag_b == PACK ) && ( rs_b != 1 ) )
|
||||
if( (should_pack_B == TRUE) || ( rs_b != 1 ) )
|
||||
{
|
||||
mem_b_size_req = sizeof( float ) * k;
|
||||
|
||||
@@ -227,7 +233,7 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
post_ops_attr.post_op_c_i = ic;
|
||||
|
||||
// To-Do: pack A case needs to be handled for AVX2 case.
|
||||
if( mtag_a == PACK && cs_a != 1 )
|
||||
if( (should_pack_A == TRUE) || ( cs_a != 1 ) )
|
||||
{
|
||||
mem_a_size_req = sizeof(float) * mc0 * k;
|
||||
lpgemm_alloc_mem_panel
|
||||
@@ -258,11 +264,11 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
&post_ops_attr
|
||||
);
|
||||
}
|
||||
if ( ( mtag_a == PACK ) && ( bli_mem_is_alloc( &mem_a ) ) )
|
||||
if ( ( (should_pack_A == TRUE) || ( cs_a != 1 ) ) && ( bli_mem_is_alloc( &mem_a ) ) )
|
||||
{
|
||||
bli_pba_release( rntm, &mem_a );
|
||||
}
|
||||
if ( ( mtag_b == PACK ) && ( bli_mem_is_alloc( &mem_b ) ) )
|
||||
if ( ( (should_pack_B == TRUE) || ( rs_b != 1 ) ) && ( bli_mem_is_alloc( &mem_b ) ) )
|
||||
{
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
@@ -294,7 +300,7 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
thread_jc.work_id = thread->tid;
|
||||
bli_thread_range_sub(&thread_jc, n, NR, FALSE, &jc_start, &jc_end);
|
||||
|
||||
if ( mtag_a == PACK )
|
||||
if ( (should_pack_A == TRUE) || ( cs_a != 1 ) )
|
||||
{
|
||||
mem_a_size_req = sizeof( float ) * k;
|
||||
|
||||
@@ -340,7 +346,7 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
rs_b_use = NR;
|
||||
cs_b_use = 1;
|
||||
}
|
||||
else if (mtag_b == PACK)
|
||||
else if ( (should_pack_B == TRUE) || ( mtag_b == PACK ) )
|
||||
{
|
||||
// nc0 needs to be a multiple of 16 since this gives maximum
|
||||
// vectorization. Packing B always results in buffers with width
|
||||
@@ -406,10 +412,15 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
} // jc loop
|
||||
|
||||
// Release pack buffers.
|
||||
if ( ( mtag_b == PACK ) && ( bli_mem_is_alloc( &mem_b ) ) )
|
||||
if ( ( (should_pack_B == TRUE) || ( mtag_b == PACK ) ) && ( bli_mem_is_alloc( &mem_b ) ) )
|
||||
{
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
|
||||
if ( ( (should_pack_A == TRUE) || ( cs_a != 1 ) ) && ( bli_mem_is_alloc( &mem_a ) ) )
|
||||
{
|
||||
bli_pba_release( rntm, &mem_a );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -472,7 +483,7 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
|
||||
siz_t mem_a_size_req = 0;
|
||||
|
||||
// Check if packing of B is required.
|
||||
bool should_pack_B = bli_rntm_pack_b( rntm ) || ( rs_b == 1 );
|
||||
bool should_pack_B = bli_rntm_pack_b( rntm );
|
||||
|
||||
// Pack buffer for B.
|
||||
float* pack_b_buffer_f32f32f32of32;
|
||||
@@ -519,8 +530,8 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
|
||||
bool invoke_rd = FALSE;
|
||||
|
||||
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3) &&
|
||||
( ( n < 48 ) || ( m < 16 ) ) && ( rs_b == 1 ) && ( mtag_b == PACK ) &&
|
||||
( mtag_a == UNPACKED ) )
|
||||
( ( n < 48 ) || ( m < 16 ) ) && ( rs_b == 1 ) &&
|
||||
( mtag_a == UNPACKED ) && ( mtag_b == PACK ) )
|
||||
{
|
||||
invoke_rd = TRUE;
|
||||
mtag_b = UNPACKED;
|
||||
@@ -558,7 +569,7 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
|
||||
is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
|
||||
post_ops_attr.is_last_k = is_last_k;
|
||||
|
||||
if ( ( mtag_b == PACK ) && ( should_pack_B == TRUE ) )
|
||||
if ( ( mtag_b == PACK ) || ( should_pack_B == TRUE ) )
|
||||
{
|
||||
// Pack B chunks are based on jc work id.
|
||||
dim_t jc_work_id = bli_thread_work_id( &thread_jc );
|
||||
@@ -744,7 +755,7 @@ LPGEMM_5LOOP(float, float, float, f32f32f32of32)
|
||||
}
|
||||
|
||||
// Release pack buffers.
|
||||
if ( ( mtag_b == PACK ) && ( should_pack_B == TRUE ) )
|
||||
if ( ( mtag_b == PACK ) || ( should_pack_B == TRUE ) )
|
||||
{
|
||||
// All threads in work group should wait till B matrix usage is
|
||||
// completed by the participating threads.
|
||||
|
||||
@@ -141,6 +141,7 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
float* pack_b_buffer_f32f32f32of32 = NULL;
|
||||
err_t err = BLIS_SUCCESS;
|
||||
|
||||
|
||||
if(n == 1)
|
||||
{
|
||||
dim_t MR;
|
||||
@@ -168,8 +169,9 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#endif
|
||||
// Pack B matrix if rs_b > 1
|
||||
if( ( mtag_b == PACK ) && ( rs_b != 1 ) )
|
||||
// Pack B matrix if rs_b > 1, ignoring the mtag_b here.
|
||||
// For tiny sizes, it is better to pack B if it affects output accuracy.
|
||||
if( ( rs_b != 1 ) )
|
||||
{
|
||||
siz_t mem_b_size_req = sizeof( float ) * k;
|
||||
pack_b_buffer_f32f32f32of32 =
|
||||
@@ -185,7 +187,8 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
cs_b_use = 1;
|
||||
}
|
||||
|
||||
if( ( mtag_a == PACK ) && ( cs_a != 1 ) )
|
||||
// For tiny sizes, it is better to pack A if it affects output accuracy.
|
||||
if( ( cs_a != 1 ) )
|
||||
{
|
||||
siz_t mem_a_size_req = sizeof(float) * m * k;
|
||||
pack_a_buffer_f32f32f32of32 =
|
||||
@@ -248,7 +251,8 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
#else
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
|
||||
#endif
|
||||
if( mtag_a == PACK && cs_a != 1)
|
||||
// For tiny sizes, it is better to pack A if it affects output accuracy.
|
||||
if( ( cs_a != 1 ) )
|
||||
{
|
||||
siz_t mem_a_size_req = sizeof( float ) * k;
|
||||
pack_a_buffer_f32f32f32of32 =
|
||||
@@ -262,7 +266,7 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
cs_a_use = 1;
|
||||
}
|
||||
|
||||
if (mtag_b == PACK)
|
||||
if ( ( mtag_b == PACK ) )
|
||||
{
|
||||
dim_t nc0_updated = make_multiple_of_n(n, NR);
|
||||
siz_t mem_b_size_req = sizeof(float) * nc0_updated * k;
|
||||
@@ -324,7 +328,7 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
|
||||
LPGEMM_TINY(float,float,float,f32f32f32of32)
|
||||
{
|
||||
// Handle using LPGEMV when m or/and n equal to 1
|
||||
// Handle using LPGEMV when m or/and n equal to 1
|
||||
if ( ( ( ( m == 1 ) || ( n == 1 ) ) ) && ( ( bli_cpuid_is_avx512_supported() == TRUE ) ||
|
||||
( bli_cpuid_is_avx2fma3_supported() == TRUE ) ) )
|
||||
{
|
||||
@@ -384,7 +388,7 @@ LPGEMM_TINY(float,float,float,f32f32f32of32)
|
||||
// Even if the mtag_b is set to PACK, for tiny sizes its better to
|
||||
// pack only if it affects output accuracy (like column major B),
|
||||
// else ignore it.
|
||||
if ( ( mtag_b == PACK ) && ( rs_b == 1 ) )
|
||||
if ( ( mtag_b == PACK ) )
|
||||
{
|
||||
dim_t nc0_updated = make_multiple_of_n( n, NR );
|
||||
mem_b_size_req = sizeof( float ) * nc0_updated * k;
|
||||
|
||||
Reference in New Issue
Block a user