Bugfix: Guarded AOCL_ENABLE_INSTRUCTIONS support based on AVX512-ISA support

- As part of rerouting to AVX2 code-paths on ZEN4/ZEN5 (or similar)
  architectures, the code-base introduced an unintended constraint
  when deploying a fat binary on ZEN/ZEN2/ZEN3 systems: users were
  required to always set AOCL_ENABLE_INSTRUCTIONS to 'ZEN3' (or a
  similar value) to make sure AVX512 code was not run on such
  architectures. This issue existed in the FP32 and BF16 APIs.

- Added runtime checks for AVX512-ISA support, so that rerouting
  based on AOCL_ENABLE_INSTRUCTIONS is applied only when the
  underlying architecture supports AVX512. This removes the
  incorrect constraint described above.

AMD-Internal: [CPUPL-7020]

Co-authored-by: Vignesh Balasubramanian <vignbala@amd.com>
This commit is contained in:
Balasubramanian, Vignesh
2025-07-24 12:20:05 +05:30
committed by GitHub
parent 8a86620753
commit 93414f56c8
3 changed files with 89 additions and 36 deletions

View File

@@ -823,20 +823,27 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
{
lpgemv_n_one_ker_ft ker_fp;
#ifdef BLIS_KERNELS_ZEN4
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
if( bli_cpuid_is_avx512_supported() == TRUE )
{
f32_MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
f32_MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
}
else
{
f32_MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32;
}
}
else
{
f32_MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32;
}
#else
#endif
// Increased MR from 6 to 16 to make use of 32 ZMM registers
f32_MR = 8;
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
#ifdef BLIS_KERNELS_ZEN4
}
#endif
// for bf16 inputs no matter if it's packed/re-ordered and unpacked,
// the matrix to be given to the kernels has to be in bf16.
@@ -944,16 +951,23 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
lpgemv_m_one_ker_ft ker_fp;
#ifdef BLIS_KERNELS_ZEN4
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
if( bli_cpuid_is_avx512_supported() == TRUE )
{
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
}
else
{
ker_fp = lpgemv_m_one_f32f32f32of32;
}
}
else
{
ker_fp = lpgemv_m_one_f32f32f32of32;
}
#else
#endif
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
#ifdef BLIS_KERNELS_ZEN4
}
#endif
// Compute the JC loop thread range for the current thread.
dim_t jc_start, jc_end;

View File

@@ -174,23 +174,32 @@ LPGEMV(float, float, float, f32f32f32of32)
// Workaround to select right kernel and blocksizes based on arch
// since GEMV parameters are not available in lpgemm context.
#ifdef BLIS_KERNELS_ZEN4
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
// Runtime check for AVX512 ISA support.
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
// underlying architecture supports AVX512 ISA.
if( bli_cpuid_is_avx512_supported() == TRUE )
{
MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
packa_fp = packa_mr8_f32f32f32of32_col_major;
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
packa_fp = packa_mr8_f32f32f32of32_col_major;
}
else
{
MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32;
packa_fp = packa_mr16_f32f32f32of32_col_major;
}
}
else
{
MR = 16;
ker_fp = lpgemv_n_one_f32f32f32of32;
packa_fp = packa_mr16_f32f32f32of32_col_major;
#endif
MR = 8;
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
packa_fp = packa_mr8_f32f32f32of32_col_major;
#ifdef BLIS_KERNELS_ZEN4
}
#else
// Increased MR from 6 to 16 to make use of 32 ZMM registers
MR = 8;
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
packa_fp = packa_mr8_f32f32f32of32_col_major;
#endif
// Pack B matrix if rs_b > 1
if( rs_b != 1 )
@@ -276,21 +285,31 @@ LPGEMV(float, float, float, f32f32f32of32)
lpgemv_m_one_ker_ft ker_fp;
lpgemv_a_pack_ft packa_fp;
#ifdef BLIS_KERNELS_ZEN4
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
#ifdef BLIS_KERNELS_ZEN4
// Runtime check for AVX512 ISA support.
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
// underlying architecture supports AVX512 ISA.
if( bli_cpuid_is_avx512_supported() == TRUE )
{
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
packa_fp = packa_mr8_f32f32f32of32_col_major;
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
packa_fp = packa_mr8_f32f32f32of32_col_major;
}
else
{
ker_fp = lpgemv_m_one_f32f32f32of32;
packa_fp = packa_mr16_f32f32f32of32_col_major;
}
}
else
{
ker_fp = lpgemv_m_one_f32f32f32of32;
packa_fp = packa_mr16_f32f32f32of32_col_major;
#endif
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
packa_fp = packa_mr8_f32f32f32of32_col_major;
#ifdef BLIS_KERNELS_ZEN4
}
#else
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
packa_fp = packa_mr8_f32f32f32of32_col_major;
#endif
#endif
// Compute the JC loop thread range for the current thread.
dim_t jc_start, jc_end;
thread_jc.n_way = ( thread_jc.n_way == 1 ) ?

View File

@@ -151,6 +151,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
// Workaround to select right kernel and blocksizes based on arch
// since GEMV parameters are not available in lpgemm context.
#ifdef BLIS_KERNELS_ZEN4
// Runtime check for AVX512 ISA support.
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
// underlying architecture supports AVX512 ISA.
if( bli_cpuid_is_avx512_supported() == TRUE )
{
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
MR = 16;
@@ -163,11 +168,16 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
ker_fp = lpgemv_n_one_f32f32f32of32;
packa_fp = packa_mr16_f32f32f32of32_col_major;
}
#else
}
else
{
#endif
// Increased MR from 6 to 16 to make use of 32 ZMM registers
MR = 8;
ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
packa_fp = packa_mr8_f32f32f32of32_col_major;
#ifdef BLIS_KERNELS_ZEN4
}
#endif
// Pack B matrix if rs_b > 1, ignoring the mtag_b here.
// For tiny sizes, it is better to pack B if it affects output accuracy.
@@ -240,6 +250,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
lpgemv_m_one_ker_ft ker_fp;
#ifdef BLIS_KERNELS_ZEN4
// Runtime check for AVX512 ISA support.
// We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
// underlying architecture supports AVX512 ISA.
if( bli_cpuid_is_avx512_supported() == TRUE )
{
if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
{
ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
@@ -248,8 +263,13 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
{
ker_fp = lpgemv_m_one_f32f32f32of32;
}
#else
}
else
{
#endif
ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
#ifdef BLIS_KERNELS_ZEN4
}
#endif
// For tiny sizes, it is better to pack A if it affects output accuracy.
if( ( cs_a != 1 ) )