Bugfix: Guarded AOCL_ENABLE_INSTRUCTIONS support based on AVX512-ISA support

- As part of rerouting to AVX2 code paths on ZEN4/ZEN5 (or similar) architectures, the code base introduced an unintended constraint when deploying a fat binary on ZEN/ZEN2/ZEN3 systems: AOCL_ENABLE_INSTRUCTIONS always had to be set to 'ZEN3' (or a similar value) to make sure AVX512 code was not run on such architectures. This issue existed in the FP32 and BF16 APIs.
- Added checks that detect AVX512 ISA support, so that rerouting based on AOCL_ENABLE_INSTRUCTIONS is applied only when the underlying architecture supports the AVX512 ISA. This removes the incorrect constraint described above.

AMD-Internal: [CPUPL-7020]
Co-authored-by: Vignesh Balasubramanian <vignbala@amd.com>
2026-04-20 07:38:53 +00:00 · 2025-07-24 12:20:05 +05:30
parent 8a86620753
commit 93414f56c8
3 changed files with 89 additions and 36 deletions
--- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
+++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
@@ -823,20 +823,27 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
 	{
 		lpgemv_n_one_ker_ft ker_fp;
 #ifdef BLIS_KERNELS_ZEN4
-		if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+		if( bli_cpuid_is_avx512_supported() == TRUE )
 		{
-			f32_MR = 16;
-			ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
+			if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+			{
+				f32_MR = 16;
+				ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
+			}
+			else
+			{
+				f32_MR = 16;
+				ker_fp = lpgemv_n_one_f32f32f32of32;
+			}
 		}
 		else
 		{
-			f32_MR = 16;
-			ker_fp = lpgemv_n_one_f32f32f32of32;
-		}
-#else
+#endif
 		// Increased MR from 6 to 16 to make use of 32 ZMM registers
 		f32_MR = 8;
 		ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
+#ifdef BLIS_KERNELS_ZEN4
+		}
 #endif
 		// for bf16 inputs no matter if it's packed/re-ordered and unpacked,
 		// the matrix to be given to the kernels has to be in bf16.
@@ -944,16 +951,23 @@ LPGEMV_AVX2(bfloat16, bfloat16, float, bf16bf16f32of32)
 		lpgemv_m_one_ker_ft ker_fp;

 #ifdef BLIS_KERNELS_ZEN4
-		if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+		if( bli_cpuid_is_avx512_supported() == TRUE )
 		{
-			ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
+			if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+			{
+				ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
+			}
+			else
+			{
+				ker_fp = lpgemv_m_one_f32f32f32of32;
+			}
 		}
 		else
 		{
-			ker_fp = lpgemv_m_one_f32f32f32of32;
-		}
-#else
+#endif
 			ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
+#ifdef BLIS_KERNELS_ZEN4
+		}
 #endif
 		// Compute the JC loop thread range for the current thread.
 		dim_t jc_start, jc_end;
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
@@ -174,23 +174,32 @@ LPGEMV(float, float, float, f32f32f32of32)
    // Workaround to select right kernel and blocksizes based on arch
    // since GEMV parameters are not available in lpgemm context.
 #ifdef BLIS_KERNELS_ZEN4
-    if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+    // Runtime check for AVX512 ISA support.
+    // We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
+    // underlying architecture supports AVX512 ISA.
+    if( bli_cpuid_is_avx512_supported() == TRUE )
    {
-      MR = 16;
-      ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
-      packa_fp = packa_mr8_f32f32f32of32_col_major;
+      if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+      {
+        MR = 16;
+        ker_fp = lpgemv_n_one_f32f32f32of32_avx512_256;
+        packa_fp = packa_mr8_f32f32f32of32_col_major;
+      }
+      else
+      {
+        MR = 16;
+        ker_fp = lpgemv_n_one_f32f32f32of32;
+        packa_fp = packa_mr16_f32f32f32of32_col_major;
+      }
    }
    else
    {
-      MR = 16;
-      ker_fp = lpgemv_n_one_f32f32f32of32;
-      packa_fp = packa_mr16_f32f32f32of32_col_major;
+#endif
+      MR = 8;
+      ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
+      packa_fp = packa_mr8_f32f32f32of32_col_major;
+#ifdef BLIS_KERNELS_ZEN4
    }
-#else
-    // Increased MR from 6 to 16 to make use of 32 ZMM registers
-    MR = 8;
-    ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
-    packa_fp = packa_mr8_f32f32f32of32_col_major;
 #endif
    // Pack B matrix if rs_b > 1
    if( rs_b != 1 )
@@ -276,21 +285,31 @@ LPGEMV(float, float, float, f32f32f32of32)
    lpgemv_m_one_ker_ft ker_fp;
    lpgemv_a_pack_ft packa_fp;

-    #ifdef BLIS_KERNELS_ZEN4
-    if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+#ifdef BLIS_KERNELS_ZEN4
+    // Runtime check for AVX512 ISA support.
+    // We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
+    // underlying architecture supports AVX512 ISA.
+    if( bli_cpuid_is_avx512_supported() == TRUE )
    {
-      ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
-      packa_fp = packa_mr8_f32f32f32of32_col_major;
+      if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
+      {
+        ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
+        packa_fp = packa_mr8_f32f32f32of32_col_major;
+      }
+      else
+      {
+        ker_fp = lpgemv_m_one_f32f32f32of32;
+        packa_fp = packa_mr16_f32f32f32of32_col_major;
+      }
    }
    else
    {
-      ker_fp = lpgemv_m_one_f32f32f32of32;
-      packa_fp = packa_mr16_f32f32f32of32_col_major;
+#endif
+      ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
+      packa_fp = packa_mr8_f32f32f32of32_col_major;
+#ifdef BLIS_KERNELS_ZEN4
    }
-    #else
-    ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
-    packa_fp = packa_mr8_f32f32f32of32_col_major;
-    #endif
+#endif
    // Compute the JC loop thread range for the current thread.
    dim_t jc_start, jc_end;
    thread_jc.n_way = ( thread_jc.n_way == 1 ) ?
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32_tiny.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32_tiny.c
@@ -151,6 +151,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
        // Workaround to select right kernel and blocksizes based on arch
        // since GEMV parameters are not available in lpgemm context.
 #ifdef BLIS_KERNELS_ZEN4
+      // Runtime check for AVX512 ISA support.
+      // We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
+      // underlying architecture supports AVX512 ISA.
+      if( bli_cpuid_is_avx512_supported() == TRUE )
+      {
        if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
        {
          MR = 16;
@@ -163,11 +168,16 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
          ker_fp = lpgemv_n_one_f32f32f32of32;
          packa_fp = packa_mr16_f32f32f32of32_col_major;
        }
-#else
+      }
+      else
+      {
+#endif
        // Increased MR from 6 to 16 to make use of 32 ZMM registers
        MR = 8;
        ker_fp = lpgemv_n_one_f32f32f32of32_avx2;
        packa_fp = packa_mr8_f32f32f32of32_col_major;
+#ifdef BLIS_KERNELS_ZEN4
+      }
 #endif
        // Pack B matrix if rs_b > 1, ignoring the mtag_b here.
        // For tiny sizes, it is better to pack B if it affects output accuracy.
@@ -240,6 +250,11 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
        lpgemv_m_one_ker_ft ker_fp;

 #ifdef BLIS_KERNELS_ZEN4
+      // Runtime check for AVX512 ISA support.
+      // We intend to use AOCL_ENABLE_INSTRUCTIONS only if the
+      // underlying architecture supports AVX512 ISA.
+      if( bli_cpuid_is_avx512_supported() == TRUE )
+      {
        if( lpgemm_get_enabled_arch() == BLIS_ARCH_ZEN3 )
        {
          ker_fp = lpgemv_m_one_f32f32f32of32_avx512_256;
@@ -248,8 +263,13 @@ LPGEMV_TINY(float, float, float, f32f32f32of32)
        {
          ker_fp = lpgemv_m_one_f32f32f32of32;
        }
-#else
+      }
+      else
+      {
+#endif
        ker_fp = lpgemv_m_one_f32f32f32of32_avx2;
+#ifdef BLIS_KERNELS_ZEN4
+      }
 #endif
        // For tiny sizes, it is better to pack A if it affects output accuracy.
        if( ( cs_a != 1 ) )