Adding support for AOCL_ENABLE_INSTRUCTIONS for f32 LPGEMM API.

-Currently lpgemm sets the context (block sizes and micro-kernels) based
on the ISA of the machine it is being executed on. However, this approach
does not give the flexibility to select a different context at runtime.
To enable runtime selection of the context, the context initialization
is modified to read the AOCL_ENABLE_INSTRUCTIONS env variable and set
the context accordingly. As part of this commit, only f32 context
selection is enabled.
-Bug fixes in scale ops in f32 micro-kernels and GEMV path selection.
-Added vectorized f32 packing kernels for NR=16 (AVX2) and NR=64 (AVX512).
This is only for the B matrix and helps remove the dependency of the f32
lpgemm API on the BLIS packing framework.

AMD Internal: [CPUPL-5959]

Change-Id: I4b459aaf33c54423952f89905ba43cf119ce20f6
This commit is contained in:
Mithun Mohan
2024-10-28 06:38:57 +00:00
parent 9ce2696fc9
commit 097cda9f9e
18 changed files with 1374 additions and 439 deletions

View File

@@ -503,7 +503,6 @@ BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
}
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( BF16BF16F32OF32 );
dim_t mr_blks = ( m + MR - 1 ) / MR;
@@ -558,22 +557,17 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
rntm_t* rntm_g
)
{
// Query the global cntx.
cntx_t* cntx = bli_gks_query_cntx();
num_t dt = BLIS_FLOAT;
// Query the context for SUP limits.
const dim_t MT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx );
const dim_t NT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
const dim_t KT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );
const dim_t MT = lpgemm_get_sup_thres_MT_global_cntx( F32F32F32OF32 );
const dim_t NT = lpgemm_get_sup_thres_NT_global_cntx( F32F32F32OF32 );
const dim_t KT = lpgemm_get_sup_thres_KT_global_cntx( F32F32F32OF32 );
// Query the context for various blocksizes.
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
dim_t NR = lpgemm_get_block_size_NR_global_cntx( F32F32F32OF32 );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( F32F32F32OF32 );
dim_t MC = lpgemm_get_block_size_MC_global_cntx( F32F32F32OF32 );
dim_t NC = lpgemm_get_block_size_NC_global_cntx( F32F32F32OF32 );
dim_t KC = lpgemm_get_block_size_KC_global_cntx( F32F32F32OF32 );
const dim_t MT_2 = MT / 2;
@@ -640,7 +634,7 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
if ( ( m >= MT ) && ( n >= NT ) && ( k >= KT ) )
{
if (((k <= page_size_b_floatx2) && (m_ic > MT_2) && (n_jc >= NT)) ||
if (((k >= page_size_b_floatx2) && (m_ic > MT_2) && (n_jc >= NT)) ||
((bli_cpuid_is_avx512_supported() == FALSE) && (k > page_size_b_floatx2)))
{
bli_rntm_set_pack_b( 1, rntm_g );