mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Bugfix: Guarded AOCL_ENABLE_INSTRUCTIONS support based on AVX512-ISA support
- As part of rerouting to AVX2 code-paths on ZEN4/ZEN5 (or similar) architectures, the code-base introduced an unintended constraint when deploying a fat binary on ZEN/ZEN2/ZEN3 systems: AOCL_ENABLE_INSTRUCTIONS always had to be set to 'ZEN3' (or a similar value) to make sure that AVX512 code was not run on such architectures. This issue existed in the FP32 and BF16 APIs. - Added checks that detect AVX512-ISA support, so that rerouting based on AOCL_ENABLE_INSTRUCTIONS is applied only when the underlying architecture supports AVX512. This removes the incorrect constraint described above. AMD-Internal: [CPUPL-7020] Co-authored-by: Vignesh Balasubramanian <vignbala@amd.com>
This commit is contained in:
committed by
GitHub
parent
8a86620753
commit
93414f56c8
@@ -823,20 +823,27 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
{
|
||||
lpgemv_n_one_ker_ft ker_fp;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
f32_MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
f32_MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32;
|
||||
}
|
||||
#else
|
||||
#endif
|
||||
// Increased MR from 6 to 16 to make use of 32 ZMM registers
|
||||
f32_MR = 8;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#endif
|
||||
// for bf16 inputs no matter if it's packed/re-ordered and unpacked,
|
||||
// the matrix to be given to the kernels has to be in bf16.
|
||||
@@ -944,16 +951,23 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
|
||||
lpgemv_m_one_ker_ft ker_fp;
|
||||
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32;
|
||||
}
|
||||
#else
|
||||
#endif
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#endif
|
||||
// Compute the JC loop thread range for the current thread.
|
||||
dim_t jc_start, jc_end;
|
||||
|
||||
@@ -174,23 +174,32 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
// Workaround to select right kernel and blocksizes based on arch
|
||||
// since GEMV parameters are not available in lpgemm context.
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
// Runtime check for AVX512 ISA support.
|
||||
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
|
||||
// underlying architecture supports AVX512 ISA.
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
}
|
||||
else
|
||||
{
|
||||
MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32;
|
||||
packa_fp = packa_mr16_f32f32f32of32_col_major;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
MR = 16;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32;
|
||||
packa_fp = packa_mr16_f32f32f32of32_col_major;
|
||||
#endif
|
||||
MR = 8;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#else
|
||||
// Increased MR from 6 to 16 to make use of 32 ZMM registers
|
||||
MR = 8;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#endif
|
||||
// Pack B matrix if rs_b > 1
|
||||
if( rs_b != 1 )
|
||||
@@ -276,21 +285,31 @@ LPGEMV(float, float, float, f32f32f32of32)
|
||||
lpgemv_m_one_ker_ft ker_fp;
|
||||
lpgemv_a_pack_ft packa_fp;
|
||||
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
// Runtime check for AVX512 ISA support.
|
||||
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
|
||||
// underlying architecture supports AVX512 ISA.
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32;
|
||||
packa_fp = packa_mr16_f32f32f32of32_col_major;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32;
|
||||
packa_fp = packa_mr16_f32f32f32of32_col_major;
|
||||
#endif
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#else
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#endif
|
||||
#endif
|
||||
// Compute the JC loop thread range for the current thread.
|
||||
dim_t jc_start, jc_end;
|
||||
thread_jc.n_way = ( thread_jc.n_way == 1 ) ?
|
||||
|
||||
@@ -151,6 +151,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
// Workaround to select right kernel and blocksizes based on arch
|
||||
// since GEMV parameters are not available in lpgemm context.
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
// Runtime check for AVX512 ISA support.
|
||||
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
|
||||
// underlying architecture supports AVX512 ISA.
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
MR = 16;
|
||||
@@ -163,11 +168,16 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32;
|
||||
packa_fp = packa_mr16_f32f32f32of32_col_major;
|
||||
}
|
||||
#else
|
||||
}
|
||||
else
|
||||
{
|
||||
#endif
|
||||
// Increased MR from 6 to 16 to make use of 32 ZMM registers
|
||||
MR = 8;
|
||||
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
|
||||
packa_fp = packa_mr8_f32f32f32of32_col_major;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#endif
|
||||
// Pack B matrix if rs_b > 1, ignoring the mtag_b here.
|
||||
// For tiny sizes, it is better to pack B if it affects output accuracy.
|
||||
@@ -240,6 +250,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
lpgemv_m_one_ker_ft ker_fp;
|
||||
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
// Runtime check for AVX512 ISA support.
|
||||
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
|
||||
// underlying architecture supports AVX512 ISA.
|
||||
if( bli_cpuid_is_avx512_supported() == TRUE )
|
||||
{
|
||||
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
|
||||
@@ -248,8 +263,13 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
|
||||
{
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32;
|
||||
}
|
||||
#else
|
||||
}
|
||||
else
|
||||
{
|
||||
#endif
|
||||
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
|
||||
#ifdef BLIS_KERNELS_ZEN4
|
||||
}
|
||||
#endif
|
||||
// For tiny sizes, it is better to pack A if it affects output accuracy.
|
||||
if( ( cs_a != 1 ) )
|
||||
|
||||
Reference in New Issue
Block a user