Fix for x86_64 builds

Configuration x86_64 includes all Intel and AMD sub-configurations. Fixes to enable this to work correctly again are:
- In config_registry, use amdzen rather than amd64 in the x86_64 family.
- Copy settings from config/amdzen/bli_family_amdzen.h to config/x86_64/bli_family_x86_64.h.
- Modify configure to set enable_aocl_zen=yes for x86_64, but not for amd64_legacy.
- Add "|| defined(BLIS_FAMILY_X86_64)" to the existing #if conditions in frame/3/bli_l3_sup.c and frame/3/bli_l3_sup_int_amd.c so that zen-specific code paths are enabled.

Note: sub-configurations knl and bulldozer use instructions that are not supported on most x86_64 processors.

AMD-Internal: [CPUPL-3838]
Change-Id: I0bd8fd89ccd846f80e5491ef44ade7d409970b04
2026-05-11 17:50:00 +00:00 · 2023-10-04 00:18:57 +05:30
parent 5d578684ea
commit 85f2bf6c4a
5 changed files with 33 additions and 11 deletions
--- a/config/x86_64/bli_family_x86_64.h
+++ b/config/x86_64/bli_family_x86_64.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -32,10 +33,30 @@

 */

-//#ifndef BLIS_FAMILY_H
-//#define BLIS_FAMILY_H
+#ifndef BLIS_FAMILY_H
+#define BLIS_FAMILY_H

+// By default, it is effective to parallelize the outer loops.
+// Setting these macros to 1 will force JR and IR inner loops
+// to be not parallelized.
+//
+#define BLIS_THREAD_MAX_IR      1
+#define BLIS_THREAD_MAX_JR      1

+#define BLIS_ENABLE_SMALL_MATRIX
+#define BLIS_ENABLE_SMALL_MATRIX_TRSM

-//#endif
+// This will select the threshold below which small matrix code will be called.
+#define BLIS_SMALL_MATRIX_THRES        700
+#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
+#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
+
+#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96
+#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128
+
+// When running HPL with pure MPI without DGEMM threading (Single-threaded
+// BLIS), defining this macro as 1 yields better performance.
+#define AOCL_BLIS_MULTIINSTANCE   0
+
+#endif

--- a/config_registry
+++ b/config_registry
@@ -8,7 +8,7 @@
 #

 # Processor families.
-x86_64:         intel64 amd64 amd64_legacy
+x86_64:         intel64 amdzen amd64_legacy
 intel64:        skx knl haswell sandybridge penryn generic
 amd64_legacy:   excavator steamroller piledriver bulldozer generic
 amdzen:         zen4 zen3 zen2 zen generic
--- a/configure
+++ b/configure
@@ -3332,10 +3332,11 @@ main()
 	uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
 	config_name_define="#define BLIS_FAMILY_${uconf}\n"

-	#create a AOCL specific #define
-	#This macro is enabled only for zen family configurations.
-	#This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
-	uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1)
+	# Create a AOCL specific #define
+	# This macro is enabled only for zen family configurations.
+	# This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
+	# Note: amd64_legacy is for pre-zen architectures.
+	uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. -f1)
 	if [[ $uconf == 1 ]]; then
 		enable_aocl_zen='yes'
 		enable_aocl_zen_01=1
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -107,7 +107,7 @@ err_t bli_gemmsup
    if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
    else                { rntm_l = *rntm;                       rntm = &rntm_l; }

-#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
+#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)

    if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
    {
--- a/frame/3/bli_l3_sup_int_amd.c
+++ b/frame/3/bli_l3_sup_int_amd.c
@@ -134,7 +134,7 @@ err_t bli_gemmsup_int
 		  }
 	  }

-#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
+#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)

 	  //Enable packing of B matrix for double data type when dims at per
 	  //thread level are above caches and enable packing of A when transA
@@ -212,7 +212,7 @@ err_t bli_gemmsup_int
 		  }
 	  }

-#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
+#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)

 	  //Enable packing of B matrix for double data type when dims at per
 	  //thread level are above caches and enable packing of A when transA