mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Bug Fixes in LPGEMM for AVX512(SkyLake) machine (#24)
* Bug Fixes in LPGEMM for AVX512(SkyLake) machine
- B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
do not support BF16 instructions, the BF16 input is un-reordered and
converted to FP32 to use FP32 kernels.
- For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
matrix to the re-ordered buffer array, but un-reordering to FP32
requires the matrix size to be a multiple of 16 along the n dimension and
a multiple of 2 along the k dimension.
- The entry condition to the above has been modified for AVX512 configuration.
- In the bf16 API, the tiny path entry check has been modified to prevent a
seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting
machines.
- Modified existing store instructions in FP32 AVX512 kernels to support
execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).
- Added Bf16 beta and store types in FP32 avx512_256 kernels
AMD Internal: [SWLCSG-3552]
* Bug Fixes in LPGEMM for AVX512(SkyLake) machine
- B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
do not support BF16 instructions, the BF16 input is un-reordered and
converted to FP32 to use FP32 kernels.
- For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
matrix to the re-ordered buffer array, but un-reordering to FP32
requires the matrix size to be a multiple of 16 along the n dimension and
a multiple of 2 along the k dimension.
- The entry condition to the above has been modified for AVX512 configuration.
- In the bf16 API, the tiny path entry check has been modified to prevent a
seg fault when AOCL_ENABLE_INSTRUCTIONS=AVX2 is set on BF16-supporting
machines.
- Modified existing store instructions in FP32 AVX512 kernels to support
execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).
- Added Bf16 beta and store types, along with BIAS and ZP in FP32 avx512_256
kernels
AMD Internal: [SWLCSG-3552]
* Bug Fixes in LPGEMM for AVX512(SkyLake) machine
- Support added in FP32 512_256 kernels for: Beta, BIAS, Zero-point and
BF16 store types for bf16bf16f32obf16 API execution in AVX2 mode.
- B-matrix in bf16bf16f32obf16/f32 API is re-ordered. For machines that
do not support BF16 instructions, the BF16 input is un-reordered and
converted to FP32 type to use FP32 kernels.
- For n = 1 and k = 1 sized matrices, re-ordering in BF16 simply copies the
matrix to the re-ordered buffer array, but un-reordering to FP32
requires the matrix size to be a multiple of 16 along the n dimension and
a multiple of 2 along the k dimension. The entry condition here has been
modified for the AVX512 configuration.
- Fix for seg fault with AOCL_ENABLE_INSTRUCTIONS=AVX2 mode in BF16/VNNI
ISA supporting configurations:
- BF16 tiny path entry check has been modified to take into account arch_id
to prevent improper entry into the tiny kernel.
- The stores in BF16->FP32 col-major for m = 1 conditions were updated to
use the correct storage pattern.
- BF16 beta load macro was modified to account for data in unaligned memory.
- Modified existing store instructions in FP32 AVX512 kernels to support
execution on machines that have AVX512 support but not BF16/VNNI (SkyLake).
AMD Internal: [SWLCSG-3552]
---------
Co-authored-by: VarshaV <varshav2@amd.com>
This commit is contained in:
@@ -100,7 +100,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32_reference)
|
||||
}
|
||||
|
||||
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
|
||||
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
|
||||
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
|
||||
{
|
||||
if( rs_b == 1 )
|
||||
{
|
||||
@@ -260,7 +260,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
|
||||
/*It is expected that while bf16 input is passed to AVX2 kernels,
|
||||
the unreorder/conversion of bf16->f32 is done, which expects the
|
||||
reordered matrix to be padded with n multiple of 16, k multiple of 2. */
|
||||
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
|
||||
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
|
||||
{
|
||||
n_reorder = 1;
|
||||
}
|
||||
@@ -271,7 +271,7 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
|
||||
|
||||
// Extra space since packing does length in multiples of 2.
|
||||
dim_t k_reorder;
|
||||
if( ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) && ( n == 1 ) )
|
||||
if( ( n == 1 ) && ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( lpgemm_get_enabled_arch() != BLIS_ARCH_ZEN3 ) )
|
||||
{
|
||||
k_reorder = k;
|
||||
}
|
||||
@@ -342,7 +342,6 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
|
||||
"cannot perform bf16bf16f32/f32f32f32 gemm.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
aocl_reorder_bf16bf16f32of32_reference( order,trans ,mat_type, input_buf_addr,
|
||||
reorder_buf_addr, k, n, ldb );
|
||||
|
||||
@@ -755,5 +754,4 @@ AOCL_GEMM_REORDER(int8_t, bf16s4f32of32)
|
||||
b.mat_type = input_mat_type;
|
||||
|
||||
reorderb_nr64_bf16s4f32of32(&b, &b_reorder, &rntm_g, lcntx_g);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -229,10 +229,14 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
|
||||
|
||||
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
|
||||
|
||||
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( is_single_thread( &rntm_g ) == TRUE) )
|
||||
/* While AOCL_ENABLE_INSTRUCTIONS=AVX2 is enabled in machines that support BF16/VNNI
|
||||
* with only the ISA check the execution could enter tiny path and result in seg fault
|
||||
* as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
|
||||
* verified here.
|
||||
*/
|
||||
arch_t arch_id = bli_arch_query_id();
|
||||
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) && ( is_single_thread( &rntm_g ) == TRUE) )
|
||||
{
|
||||
if( ( is_row_major == TRUE ) &&
|
||||
( is_tiny_input_bf16obf16( m, n, k, lcntx_g ) == TRUE ) )
|
||||
@@ -326,4 +330,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
|
||||
err_hndl:;
|
||||
LPGEMM_STOP_LOGGER();
|
||||
}
|
||||
}
|
||||
@@ -235,8 +235,13 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
|
||||
|
||||
#if (defined(BLIS_KERNELS_ZEN4) && (!defined(LPGEMM_BF16_JIT)))
|
||||
|
||||
if ( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) &&
|
||||
/* While AOCL_ENABLE_INSTRUCTIONS=AVX2 is enabled in machines that support BF16/VNNI
|
||||
* with only the ISA check the execution could enter tiny path and result in seg fault
|
||||
* as the tiny path for BF16->FP32 is not available. Hence the arch_id also has to be
|
||||
* verified here.
|
||||
*/
|
||||
arch_t arch_id = bli_arch_query_id();
|
||||
if( ( bli_cpuid_is_avx512bf16_supported() == TRUE ) && ( ( arch_id == BLIS_ARCH_ZEN4 ) || ( arch_id == BLIS_ARCH_ZEN5 ) ) &&
|
||||
( is_tiny_input_bf16of32( m, n, k, lcntx_g ) == TRUE ) &&
|
||||
( is_single_thread( &rntm_g ) == TRUE) &&
|
||||
( is_row_major == TRUE ) )
|
||||
@@ -315,4 +320,4 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
|
||||
err_hndl:;
|
||||
LPGEMM_STOP_LOGGER();
|
||||
}
|
||||
}
|
||||
@@ -799,6 +799,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
|
||||
dim_t ic_start, ic_end;
|
||||
bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
|
||||
|
||||
for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
|
||||
{
|
||||
dim_t nc0 = bli_min( ( jc_end - jc ), NC );
|
||||
@@ -898,7 +899,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
bli_thread_ocomm_id( &thread_ic ),
|
||||
&thread->comm[jc_work_id]
|
||||
);
|
||||
|
||||
if ( mtag_b == PACK )
|
||||
{
|
||||
cvt_b_buffer_bf16_f32 =
|
||||
@@ -914,6 +914,7 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
&thread_ic, nc0, NR, FALSE,
|
||||
&jc_packb_start, &jc_packb_end
|
||||
);
|
||||
|
||||
// Ensure thread ranges are valid, especially cases where no:
|
||||
// of threads available for parallelization are greater than
|
||||
// no: of B panel NR chunks.
|
||||
@@ -1014,7 +1015,6 @@ LPGEMM_5LOOP_AVX2(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
mem_a_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_a, rntm
|
||||
);
|
||||
|
||||
// For packed or unpacked A matrix, the mc0 * kc0 block is
|
||||
//converted to F32, i.e., packing has to be done by default
|
||||
cvt_a_buffer_bf16_f32 =
|
||||
|
||||
Reference in New Issue
Block a user