mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Fix for x86_64 builds
Configuration x86_64 includes all Intel and AMD sub-configurations. The following fixes enable it to work correctly again:
- In config_registry, use amdzen rather than amd64 in the x86_64 family.
- Copy settings from config/amdzen/bli_family_amdzen.h to config/x86_64/bli_family_x86_64.h.
- Modify configure to set enable_aocl_zen=yes for x86_64, but not for amd64_legacy.
- Add "if defined(BLIS_FAMILY_X86_64)" to frame/3/bli_l3_sup.c and frame/3/bli_l3_sup_int_amd.c so that zen-specific code paths are enabled.

Note: sub-configurations knl and bulldozer use instructions that are not supported on most x86_64 processors.

AMD-Internal: [CPUPL-3838]
Change-Id: I0bd8fd89ccd846f80e5491ef44ade7d409970b04
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -32,10 +33,30 @@
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
#ifndef BLIS_FAMILY_H
|
||||
#define BLIS_FAMILY_H
|
||||
|
||||
// By default, it is effective to parallelize the outer loops.
|
||||
// Setting these macros to 1 will force JR and IR inner loops
|
||||
// not to be parallelized.
|
||||
//
|
||||
#define BLIS_THREAD_MAX_IR 1
|
||||
#define BLIS_THREAD_MAX_JR 1
|
||||
|
||||
#define BLIS_ENABLE_SMALL_MATRIX
|
||||
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
|
||||
//#endif
|
||||
// This will select the threshold below which small matrix code will be called.
|
||||
#define BLIS_SMALL_MATRIX_THRES 700
|
||||
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
|
||||
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
|
||||
|
||||
#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96
|
||||
#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128
|
||||
|
||||
// When running HPL with pure MPI without DGEMM threading (Single-threaded
|
||||
// BLIS), defining this macro as 1 yields better performance.
|
||||
#define AOCL_BLIS_MULTIINSTANCE 0
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#
|
||||
|
||||
# Processor families.
|
||||
x86_64: intel64 amd64 amd64_legacy
|
||||
x86_64: intel64 amdzen amd64_legacy
|
||||
intel64: skx knl haswell sandybridge penryn generic
|
||||
amd64_legacy: excavator steamroller piledriver bulldozer generic
|
||||
amdzen: zen4 zen3 zen2 zen generic
|
||||
|
||||
9
configure
vendored
9
configure
vendored
@@ -3332,10 +3332,11 @@ main()
|
||||
uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
|
||||
config_name_define="#define BLIS_FAMILY_${uconf}\n"
|
||||
|
||||
#create a AOCL specific #define
|
||||
#This macro is enabled only for zen family configurations.
|
||||
#This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1)
|
||||
# Create an AOCL-specific #define
|
||||
# This macro is enabled only for zen family configurations.
|
||||
# This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
# Note: amd64_legacy is for pre-zen architectures.
|
||||
uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. -f1)
|
||||
if [[ $uconf == 1 ]]; then
|
||||
enable_aocl_zen='yes'
|
||||
enable_aocl_zen_01=1
|
||||
|
||||
@@ -107,7 +107,7 @@ err_t bli_gemmsup
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
|
||||
#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
|
||||
|
||||
if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
|
||||
{
|
||||
|
||||
@@ -134,7 +134,7 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
@@ -212,7 +212,7 @@ err_t bli_gemmsup_int
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
|
||||
#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
|
||||
|
||||
//Enable packing of B matrix for double data type when dims at per
|
||||
//thread level are above caches and enable packing of A when transA
|
||||
|
||||
Reference in New Issue
Block a user