diff --git a/config/x86_64/bli_family_x86_64.h b/config/x86_64/bli_family_x86_64.h index 21b44db87..c327a0b19 100644 --- a/config/x86_64/bli_family_x86_64.h +++ b/config/x86_64/bli_family_x86_64.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,30 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#ifndef BLIS_FAMILY_H +#define BLIS_FAMILY_H +// By default, it is effective to parallelize the outer loops. +// Setting these macros to 1 prevents the JR and IR inner loops +// from being parallelized. +// +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 +#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX_TRSM -//#endif +// These select the thresholds below which the small matrix code will be called. +#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 +#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 + +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 + +// When running HPL with pure MPI without DGEMM threading (Single-threaded +// BLIS), defining this macro as 1 yields better performance. +#define AOCL_BLIS_MULTIINSTANCE 0 + +#endif diff --git a/config_registry b/config_registry index 4e6716dfa..8a3a47bfb 100644 --- a/config_registry +++ b/config_registry @@ -8,7 +8,7 @@ # # Processor families. 
-x86_64: intel64 amd64 amd64_legacy +x86_64: intel64 amdzen amd64_legacy intel64: skx knl haswell sandybridge penryn generic amd64_legacy: excavator steamroller piledriver bulldozer generic amdzen: zen4 zen3 zen2 zen generic diff --git a/configure b/configure index a165c1ad5..96a803504 100755 --- a/configure +++ b/configure @@ -3332,10 +3332,11 @@ main() uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - #create a AOCL specific #define - #This macro is enabled only for zen family configurations. - #This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. - uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1) + # Create an AOCL-specific #define + # This macro is enabled only for zen, amd64 and x86_64 family configurations. + # This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. + # Note: amd64_legacy is for pre-zen architectures. + uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. 
-f1) if [[ $uconf == 1 ]]; then enable_aocl_zen='yes' enable_aocl_zen_01=1 diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index afd74d2ee..317956ba4 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -107,7 +107,7 @@ err_t bli_gemmsup if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } -#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) if((bli_arch_query_id() == BLIS_ARCH_ZEN4)) { diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c index 69b691674..2664f48bf 100644 --- a/frame/3/bli_l3_sup_int_amd.c +++ b/frame/3/bli_l3_sup_int_amd.c @@ -134,7 +134,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA @@ -212,7 +212,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA