Fix for x86_64 builds

Configuration x86_64 includes all Intel and AMD sub-configurations.
Fixes to enable this to work correctly again are:
- In config_registry use amdzen rather than amd64 in x86_64 family.
- Copy settings from config/amdzen/bli_family_amdzen.h to
  config/x86_64/bli_family_x86_64.h
- Modify configure to set enable_aocl_zen=yes for x86_64, but not
  for amd64_legacy.
- Add "if defined(BLIS_FAMILY_X86_64)" to frame/3/bli_l3_sup.c and
  frame/3/bli_l3_sup_int_amd.c so zen-specific code paths are
  enabled.

Note: sub-configurations knl and bulldozer use instructions that are
not supported on most x86_64 processors.

AMD-Internal: [CPUPL-3838]
Change-Id: I0bd8fd89ccd846f80e5491ef44ade7d409970b04
This commit is contained in:
Edward Smyth
2023-10-04 00:18:57 +05:30
parent 5d578684ea
commit 85f2bf6c4a
5 changed files with 33 additions and 11 deletions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,10 +33,30 @@
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// Include guard for this family-level configuration header.
#ifndef BLIS_FAMILY_H
#define BLIS_FAMILY_H
// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 prevents the JR and IR inner loops
// from being parallelized.
//
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
// NOTE(review): presumably these two macros switch on the small-matrix code
// paths (general and TRSM) that the thresholds below apply to -- confirm
// against the code that tests them.
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
//#endif
// This will select the threshold below which small matrix code will be called.
// NOTE(review): how each threshold is compared against the matrix dimensions
// (M, N, K) is not visible in this header -- check the call sites before
// tuning these values.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96
#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128
// When running HPL with pure MPI without DGEMM threading (Single-threaded
// BLIS), defining this macro as 1 yields better performance.
#define AOCL_BLIS_MULTIINSTANCE 0
#endif // BLIS_FAMILY_H

View File

@@ -8,7 +8,7 @@
#
# Processor families.
x86_64: intel64 amd64 amd64_legacy
x86_64: intel64 amdzen amd64_legacy
intel64: skx knl haswell sandybridge penryn generic
amd64_legacy: excavator steamroller piledriver bulldozer generic
amdzen: zen4 zen3 zen2 zen generic

9
configure vendored
View File

@@ -3332,10 +3332,11 @@ main()
uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
config_name_define="#define BLIS_FAMILY_${uconf}\n"
#create a AOCL specific #define
#This macro is enabled only for zen family configurations.
#This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1)
# Create an AOCL-specific #define.
# This macro is enabled for configurations whose name contains zen, amd64 or
# x86_64, with the exception of amd64_legacy.
# It enables us to use different cache block sizes for TRSM instead of the common level-3 block sizes.
# Note: amd64_legacy is for pre-zen architectures.
uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. -f1)
if [[ $uconf == 1 ]]; then
enable_aocl_zen='yes'
enable_aocl_zen_01=1

View File

@@ -107,7 +107,7 @@ err_t bli_gemmsup
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
{

View File

@@ -134,7 +134,7 @@ err_t bli_gemmsup_int
}
}
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
//Enable packing of B matrix for double data type when dims at per
//thread level are above caches and enable packing of A when transA
@@ -212,7 +212,7 @@ err_t bli_gemmsup_int
}
}
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
//Enable packing of B matrix for double data type when dims at per
//thread level are above caches and enable packing of A when transA