Bug Fixes in LPGEMM for AVX512(SkyLake) machine (#24)

* Bug Fixes in LPGEMM for AVX512(SkyLake) machine - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that don't support BF16 instructions, the BF16 input is un-reordered and converted to FP32 to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have a size that is a multiple of 16 along n and a multiple of 2 along the k dimension. - The entry condition to the above has been modified for AVX512 configuration. - In bf16 API, the tiny path entry check has been modified to prevent a seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting machines. - Modified existing store instructions in FP32 AVX512 kernels to support execution on machines that have AVX512 support but not BF16/VNNI(SkyLake). - Added Bf16 beta and store types in FP32 avx512_256 kernels AMD Internal: [SWLCSG-3552] * Bug Fixes in LPGEMM for AVX512(SkyLake) machine - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that don't support BF16 instructions, the BF16 input is un-reordered and converted to FP32 to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have a size that is a multiple of 16 along n and a multiple of 2 along the k dimension. - The entry condition to the above has been modified for AVX512 configuration. - In bf16 API, the tiny path entry check has been modified to prevent a seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting machines. - Modified existing store instructions in FP32 AVX512 kernels to support execution on machines that have AVX512 support but not BF16/VNNI(SkyLake). 
- Added Bf16 beta and store types, along with BIAS and ZP in FP32 avx512_256 kernels AMD Internal: [SWLCSG-3552] * Bug Fixes in LPGEMM for AVX512(SkyLake) machine - Support added in FP32 512_256 kernels for: Beta, BIAS, Zero-point and BF16 store types for bf16bf16f32obf16 API execution in AVX2 mode. - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that don't support BF16 instructions, the BF16 input is un-reordered and converted to FP32 type to use FP32 kernels. - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the matrix to the re-ordered buffer array. But the un-reordering to FP32 requires the matrix to have a size that is a multiple of 16 along n and a multiple of 2 along the k dimension. The entry condition here has been modified for AVX512 configuration. - Fix for seg fault with AOCL_ENABLE_INSTRUCTIONS=AVX2 mode in BF16/VNNI ISA supporting configurations: - BF16 tiny path entry check has been modified to take into account arch_id to prevent improper entry into the tiny kernel. - The store in BF16->FP32 col-major for m = 1 conditions was updated to use the correct storage pattern. - BF16 beta load macro was modified to account for data in unaligned memory. - Modified existing store instructions in FP32 AVX512 kernels to support execution on machines that have AVX512 support but not BF16/VNNI(SkyLake) AMD Internal: [SWLCSG-3552] --------- Co-authored-by: VarshaV <varshav2@amd.com>
2026-04-20 15:48:50 +00:00 · 2025-05-30 17:22:49 +05:30
parent 62d4fcb398
commit 532eab12d3
12 changed files with 1390 additions and 738 deletions
--- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c
@@ -100,7 +100,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32_reference)
 	}

 #if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
-	if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
+	if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
 	{
 		if( rs_b == 1 )
 		{
@@ -260,7 +260,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
 	/*It is expected that while bf16 input is passed to AVX2 kernels,
 	  the unreorder/conversion of bf16->f32 is done, which expects the
 	  reordered matrix to be padded with n multiple of 16, k multiple of 2. */
-	if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) &&  ( n == 1 ) )
+	if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
 	{
 		n_reorder = 1;
 	}
@@ -271,7 +271,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)

 	// Extra space since packing does length in multiples of 2.
 	dim_t k_reorder;
-	if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
+	if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
 	{
 		k_reorder = k;
 	}
@@ -342,7 +342,6 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
 				"cannot perform bf16bf16f32/f32f32f32 gemm.", __FILE__, __LINE__ );
 			return; // Error.
 		}
-
 		aocl_reorder_bf16bf16f32of32_reference( order,trans ,mat_type, input_buf_addr,
 								reorder_buf_addr, k, n, ldb );

@@ -755,5 +754,4 @@ AOCL_GEMM_REORDER(int8_t, bf16s4f32of32)
 	b.mat_type = input_mat_type;

 	reorderb_nr64_bf16s4f32of32(&b, &b_reorder, &rntm_g, lcntx_g);
-}
-
+}
--- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c
@@ -229,10 +229,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 	bli_pba_rntm_set_pba( &rntm_g );

 	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
-
 #if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
-
-	if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( is_single_thread( &rntm_g ) == TRUE) )
+	/* When AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on machines that support BF16/VNNI,
+	*  an ISA-only check could let execution enter the tiny path and cause a seg fault,
+	*  as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
+	*  verified here.
+	*/
+	arch_t arch_id =  bli_arch_query_id();
+	if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) && ( is_single_thread( &rntm_g ) == TRUE) )
 	{
 		if( ( is_row_major == TRUE ) &&
 			( is_tiny_input_bf16obf16( m, n, k, lcntx_g ) == TRUE ) )
@@ -326,4 +330,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)

 err_hndl:;
 	LPGEMM_STOP_LOGGER();
-}
+}
--- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c
@@ -235,8 +235,13 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
 	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );

 #if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
-
-	if ( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) &&
+	/* When AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on machines that support BF16/VNNI,
+	*  an ISA-only check could let execution enter the tiny path and cause a seg fault,
+	*  as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
+	*  verified here.
+	*/
+	arch_t arch_id =  bli_arch_query_id();
+	if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) &&
 		 ( is_tiny_input_bf16of32( m, n, k, lcntx_g ) == TRUE ) &&
 		 ( is_single_thread( &rntm_g ) == TRUE) &&
 		 ( is_row_major == TRUE ) )
@@ -315,4 +320,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)

 err_hndl:;
 	LPGEMM_STOP_LOGGER();
-}
+}
--- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
+++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
@@ -799,6 +799,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)

 	dim_t ic_start, ic_end;
 	bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
+
 	for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
 	{
 		dim_t nc0 = bli_min( ( jc_end - jc ), NC );
@@ -898,7 +899,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
 				bli_thread_ocomm_id( &thread_ic ),
 				&thread->comm[jc_work_id]
 			);
-
 			if ( mtag_b == PACK )
 			{
 				cvt_b_buffer_bf16_f32 =
@@ -914,6 +914,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
 				  &thread_ic, nc0, NR, FALSE,
 				  &jc_packb_start, &jc_packb_end
 				);
+
 				// Ensure thread ranges are valid, especially cases where no:
 				// of threads available for parallelization are greater than
 				// no: of B panel NR chunks.
@@ -1014,7 +1015,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
 				  mem_a_size_req, BLIS_BUFFER_FOR_GEN_USE,
 				  &mem_a, rntm
 				);
-
 				// For packed or unpacked A matrix, the mc0 * kc0 block is
 				//converted to F32, i.e., packing has to be done by default
 				cvt_a_buffer_bf16_f32 =