From 85f2bf6c4adc0d7e339b45bb7988294c278af88c Mon Sep 17 00:00:00 2001 From: Edward Smyth Date: Wed, 4 Oct 2023 00:18:57 +0530 Subject: [PATCH] Fix for x86_64 builds Configuration x86_64 includes all Intel and AMD sub-configurations. Fixes to enable this to work correctly again are: - In config_registry use amdzen rather than amd64 in x86_64 family. - Copy settings from config/amdzen/bli_family_amdzen.h to config/x86_64/bli_family_x86_64.h - Modify configure to set enable_aocl_zen=yes for x86_64, but not for amd64_legacy. - Add "if defined(BLIS_FAMILY_X86_64)" to frame/3/bli_l3_sup.c and frame/3/bli_l3_sup_int_amd.c so zen-specific code paths are enabled. Note: sub-configurations knl and bulldozer use instructions that are not supported on most x86_64 processors. AMD-Internal: [CPUPL-3838] Change-Id: I0bd8fd89ccd846f80e5491ef44ade7d409970b04 --- config/x86_64/bli_family_x86_64.h | 27 ++++++++++++++++++++++++--- config_registry | 2 +- configure | 9 +++++---- frame/3/bli_l3_sup.c | 2 +- frame/3/bli_l3_sup_int_amd.c | 4 ++-- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/config/x86_64/bli_family_x86_64.h b/config/x86_64/bli_family_x86_64.h index 21b44db87..c327a0b19 100644 --- a/config/x86_64/bli_family_x86_64.h +++ b/config/x86_64/bli_family_x86_64.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,30 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#ifndef BLIS_FAMILY_H +#define BLIS_FAMILY_H +// By default, it is effective to parallelize the outer loops. +// Setting these macros to 1 will force JR and IR inner loops +// to be not parallelized. 
+// +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 +#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX_TRSM -//#endif +// This will select the threshold below which small matrix code will be called. +#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 +#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 + +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 + +// When running HPL with pure MPI without DGEMM threading (Single-threaded +// BLIS), defining this macro as 1 yields better performance. +#define AOCL_BLIS_MULTIINSTANCE 0 + +#endif diff --git a/config_registry b/config_registry index 4e6716dfa..8a3a47bfb 100644 --- a/config_registry +++ b/config_registry @@ -8,7 +8,7 @@ # # Processor families. -x86_64: intel64 amd64 amd64_legacy +x86_64: intel64 amdzen amd64_legacy intel64: skx knl haswell sandybridge penryn generic amd64_legacy: excavator steamroller piledriver bulldozer generic amdzen: zen4 zen3 zen2 zen generic diff --git a/configure b/configure index a165c1ad5..96a803504 100755 --- a/configure +++ b/configure @@ -3332,10 +3332,11 @@ main() uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - #create a AOCL specific #define - #This macro is enabled only for zen family configurations. - #This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. - uconf=$(echo ${config_name} | grep -c 'zen\|amd64' | cut -d. -f1) + # Create an AOCL-specific #define + # This macro is enabled for zen family configurations, and for the amd64 and x86_64 families. + # This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes. + # Note: amd64_legacy is for pre-zen architectures. + uconf=$(echo ${config_name} | grep -v amd64_legacy |grep -c 'zen\|amd64\|x86_64' | cut -d. 
-f1) if [[ $uconf == 1 ]]; then enable_aocl_zen='yes' enable_aocl_zen_01=1 diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index afd74d2ee..317956ba4 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -107,7 +107,7 @@ err_t bli_gemmsup if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } -#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) if((bli_arch_query_id() == BLIS_ARCH_ZEN4)) { diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c index 69b691674..2664f48bf 100644 --- a/frame/3/bli_l3_sup_int_amd.c +++ b/frame/3/bli_l3_sup_int_amd.c @@ -134,7 +134,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA @@ -212,7 +212,7 @@ err_t bli_gemmsup_int } } -#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64) //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA