Bug Fixes in LPGEMM for AVX512(SkyLake) machine (#24)

* Bug Fixes in LPGEMM for AVX512(SkyLake) machine

 - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
  do not support BF16 instructions, the BF16 input is un-reordered and
  converted to FP32 to use FP32 kernels.

 - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
  matrix to the re-ordered buffer array. But the un-reordering to FP32
  requires the matrix size to be a multiple of 16 along n and a multiple
  of 2 along the k dimension.

 - The entry condition to the above has been modified for AVX512 configuration.

 - In the bf16 API, the tiny path entry check has been modified to prevent
  a seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting
  machines.

 - Modified existing store instructions in FP32 AVX512 kernels to support
  execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).

 - Added Bf16 beta and store types in FP32 avx512_256 kernels

AMD Internal: [SWLCSG-3552]

* Bug Fixes in LPGEMM for AVX512(SkyLake) machine

 - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
  do not support BF16 instructions, the BF16 input is un-reordered and
  converted to FP32 to use FP32 kernels.

 - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
  matrix to the re-ordered buffer array. But the un-reordering to FP32
  requires the matrix size to be a multiple of 16 along n and a multiple
  of 2 along the k dimension.

 - The entry condition to the above has been modified for AVX512 configuration.

 - In the bf16 API, the tiny path entry check has been modified to prevent
  a seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting
  machines.

 - Modified existing store instructions in FP32 AVX512 kernels to support
  execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).

 - Added Bf16 beta and store types, along with BIAS and ZP in FP32 avx512_256
  kernels

AMD Internal: [SWLCSG-3552]

* Bug Fixes in LPGEMM for AVX512(SkyLake) machine

 - Support added in FP32 512_256 kernels for: Beta, BIAS, Zero-point and
   BF16 store types for bf16bf16f32obf16 API execution in AVX2 mode.

 - B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
  do not support BF16 instructions, the BF16 input is un-reordered and
  converted to FP32 type to use FP32 kernels.

 - For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
  matrix to the re-ordered buffer array. But the un-reordering to FP32
  requires the matrix size to be a multiple of 16 along n and a multiple
  of 2 along the k dimension. The entry condition here has been modified for
  the AVX512 configuration.

 - Fix for seg fault with AOCL_ENABLE_INSTRUCTIONS=AVX2 mode in BF16/VNNI
   ISA supporting configurations:
   - BF16 tiny path entry check has been modified to take into account arch_id
     to prevent improper entry into the tiny kernel.
   - The store in BF16->FP32 col-major for m = 1 conditions was updated to
     the correct storage pattern.
   - BF16 beta load macro was modified to account for data in unaligned memory.

 - Modified existing store instructions in FP32 AVX512 kernels to support
  execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).

AMD Internal: [SWLCSG-3552]

---------

Co-authored-by: VarshaV <varshav2@amd.com>
This commit is contained in:
V, Varsha
2025-05-30 17:22:49 +05:30
committed by GitHub
parent 62d4fcb398
commit 532eab12d3
12 changed files with 1390 additions and 738 deletions

View File

@@ -100,7 +100,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32_reference)
}
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
{
if( rs_b == 1 )
{
@@ -260,7 +260,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
/*It is expected that while bf16 input is passed to AVX2 kernels,
the unreorder/conversion of bf16->f32 is done, which expects the
reordered matrix to be padded with n multiple of 16, k multiple of 2. */
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
{
n_reorder = 1;
}
@@ -271,7 +271,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
// Extra space since packing does length in multiples of 2.
dim_t k_reorder;
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
{
k_reorder = k;
}
@@ -342,7 +342,6 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
"cannot perform bf16bf16f32/f32f32f32 gemm.", __FILE__, __LINE__ );
return; // Error.
}
aocl_reorder_bf16bf16f32of32_reference( order,trans ,mat_type, input_buf_addr,
reorder_buf_addr, k, n, ldb );
@@ -755,5 +754,4 @@ AOCL_GEMM_REORDER(int8_t, bf16s4f32of32)
b.mat_type = input_mat_type;
reorderb_nr64_bf16s4f32of32(&b, &b_reorder, &rntm_g, lcntx_g);
}
}

View File

@@ -229,10 +229,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( is_single_thread( &rntm_g ) == TRUE) )
/* When AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on machines that support BF16/VNNI,
* an ISA-only check could let execution enter the tiny path and result in a seg
* fault, as the tiny path for BF16->FP32 is not available. Hence the arch_id also
* has to be verified here.
*/
arch_t arch_id = bli_arch_query_id();
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) && ( is_single_thread( &rntm_g ) == TRUE) )
{
if( ( is_row_major == TRUE ) &&
( is_tiny_input_bf16obf16( m, n, k, lcntx_g ) == TRUE ) )
@@ -326,4 +330,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
err_hndl:;
LPGEMM_STOP_LOGGER();
}
}

View File

@@ -235,8 +235,13 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
if ( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) &&
/* When AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on machines that support BF16/VNNI,
* an ISA-only check could let execution enter the tiny path and result in a seg
* fault, as the tiny path for BF16->FP32 is not available. Hence the arch_id also
* has to be verified here.
*/
arch_t arch_id = bli_arch_query_id();
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) &&
( is_tiny_input_bf16of32( m, n, k, lcntx_g ) == TRUE ) &&
( is_single_thread( &rntm_g ) == TRUE) &&
( is_row_major == TRUE ) )
@@ -315,4 +320,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
err_hndl:;
LPGEMM_STOP_LOGGER();
}
}

View File

@@ -799,6 +799,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
dim_t ic_start, ic_end;
bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
{
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
@@ -898,7 +899,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
bli_thread_ocomm_id( &thread_ic ),
&thread->comm[jc_work_id]
);
if ( mtag_b == PACK )
{
cvt_b_buffer_bf16_f32 =
@@ -914,6 +914,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
&thread_ic, nc0, NR, FALSE,
&jc_packb_start, &jc_packb_end
);
// Ensure thread ranges are valid, especially cases where no:
// of threads available for parallelization are greater than
// no: of B panel NR chunks.
@@ -1014,7 +1015,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
mem_a_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_a, rntm
);
// For packed or unpacked A matrix, the mc0 * kc0 block is
//converted to F32, i.e., packing has to be done by default
cvt_a_buffer_bf16_f32 =