Adding support for AOCL_ENABLE_INSTRUCTIONS for f32 LPGEMM API.

- Currently, lpgemm sets its context (block sizes and micro-kernels) based on the ISA of the machine it is executing on. However, this approach does not offer the flexibility to select a different context at runtime. To enable runtime selection, the context initialization now reads the AOCL_ENABLE_INSTRUCTIONS environment variable and sets the context accordingly. As part of this commit, only f32 context selection is enabled.
- Bug fixes in the scale ops of the f32 micro-kernels and in GEMV path selection.
- Added vectorized f32 packing kernels for NR=16 (AVX2) and NR=64 (AVX512). These are for the B matrix only and help remove the dependency of the f32 lpgemm API on the BLIS packing framework.

AMD Internal: [CPUPL-5959]
Change-Id: I4b459aaf33c54423952f89905ba43cf119ce20f6
2026-05-05 06:51:11 +00:00 · 2024-10-28 06:38:57 +00:00
parent 9ce2696fc9
commit 097cda9f9e
18 changed files with 1374 additions and 439 deletions
--- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c
+++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c
@@ -503,7 +503,6 @@ BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
 	}
 	else if ( ( *n_threads ) > 1 )
 	{
-
 		dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
 		dim_t MR = lpgemm_get_block_size_MR_global_cntx( BF16BF16F32OF32 );
 		dim_t mr_blks = ( m + MR - 1 ) / MR;
@@ -558,22 +557,17 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
       rntm_t* rntm_g
     )
 {
-	// Query the global cntx.
-	cntx_t* cntx = bli_gks_query_cntx();
-
-	num_t dt = BLIS_FLOAT;
-
 	// Query the context for SUP limits.
-	const dim_t MT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx );
-	const dim_t NT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
-	const dim_t KT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );
+	const dim_t MT = lpgemm_get_sup_thres_MT_global_cntx( F32F32F32OF32 );
+	const dim_t NT = lpgemm_get_sup_thres_NT_global_cntx( F32F32F32OF32 );
+	const dim_t KT = lpgemm_get_sup_thres_KT_global_cntx( F32F32F32OF32 );

 	// Query the context for various blocksizes.
-	const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
-	const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
-	const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
-	const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
-	const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+	dim_t NR = lpgemm_get_block_size_NR_global_cntx( F32F32F32OF32 );
+	dim_t MR = lpgemm_get_block_size_MR_global_cntx( F32F32F32OF32 );
+	dim_t MC = lpgemm_get_block_size_MC_global_cntx( F32F32F32OF32 );
+	dim_t NC = lpgemm_get_block_size_NC_global_cntx( F32F32F32OF32 );
+	dim_t KC = lpgemm_get_block_size_KC_global_cntx( F32F32F32OF32 );

 	const dim_t MT_2 = MT / 2;

@@ -640,7 +634,7 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading

 	if ( ( m >= MT ) && ( n >= NT ) && ( k >= KT ) )
 	{
-		if (((k <= page_size_b_floatx2) && (m_ic > MT_2) && (n_jc >= NT)) ||
+		if (((k >= page_size_b_floatx2) && (m_ic > MT_2) && (n_jc >= NT)) ||
 		    ((bli_cpuid_is_avx512_supported() == FALSE) && (k > page_size_b_floatx2)))
 		{
 			bli_rntm_set_pack_b( 1, rntm_g );