From 42d05a5aa031bcf99763398bbf8f5e9dae339db7 Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Thu, 6 Apr 2023 16:08:25 +0000 Subject: [PATCH] DGEMM: Added decision logic to choose between sup vs native for zen4 architecture Details: - Added a new function for choosing between SUP and native implementation for a given size. - This function pointer is stored in cntx for zen4 config. - Divided total combinations of sizes into 3 categories: - one dimension is small - Two dimensions are small - All dimensions are small - Added different threshold conditions for each of the categories. AMD-Internal: [CPUPL-2755] Change-Id: Iae4bf96bb7c9bf9f68fd909fb757d7fe13bc6caf --- config/zen4/bli_cntx_init_zen4.c | 8 ++- config/zen4/bli_family_zen4.h | 2 +- frame/3/bli_l3_sup_int_amd.c | 12 ++-- kernels/zen4/CMakeLists.txt | 2 +- kernels/zen4/aocl_smart/CMakeLists.txt | 6 ++ kernels/zen4/aocl_smart/bli_aocl_smart.c | 71 ++++++++++++++++++++++++ kernels/zen4/bli_kernels_zen4.h | 11 +++- 7 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 kernels/zen4/aocl_smart/CMakeLists.txt create mode 100644 kernels/zen4/aocl_smart/bli_aocl_smart.c diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index 4003e179c..470c5e3c2 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -90,7 +90,9 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // Update the context with architecture specific threshold functions bli_cntx_set_l3_thresh_funcs ( - 2, + 3, + // GEMM + BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4, // GEMMT BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen, // SYRK @@ -216,8 +218,8 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. 
s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 1000, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 1000, 256, 128 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); // Initialize the context with the sup thresholds. diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index d3a92b88c..263e2b695 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c index 12f33ebd9..029c383dc 100644 --- a/frame/3/bli_l3_sup_int_amd.c +++ b/frame/3/bli_l3_sup_int_amd.c @@ -134,10 +134,12 @@ err_t bli_gemmsup_int } } +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) + //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA //(RRC or CRC storage ids) to avoid rd kernels - if(bli_is_double(dt)) + if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3)) { dim_t m_pt = (m/bli_rntm_ways_for( BLIS_MC, rntm )); dim_t n_pt = (n/bli_rntm_ways_for( BLIS_NC, rntm )); @@ -153,7 +155,7 @@ err_t bli_gemmsup_int } } } - +#endif // Using the 1n kernel (B broadcast) gave better performance for sgemm // in single-thread scenario, given the number of n panels are // sufficiently larger than m panels. 
@@ -210,10 +212,12 @@ err_t bli_gemmsup_int } } +#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN) + //Enable packing of B matrix for double data type when dims at per //thread level are above caches and enable packing of A when transA //(RRC or CRC storage ids) to avoid rd kernels - if(bli_is_double(dt)) + if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3)) { dim_t m_pt = (m/bli_rntm_ways_for( BLIS_NC, rntm )); dim_t n_pt = (n/bli_rntm_ways_for( BLIS_MC, rntm )); @@ -229,7 +233,7 @@ err_t bli_gemmsup_int } } } - +#endif if ( bli_is_float( dt ) && ( n_threads == 1 ) && ( use_pb == TRUE ) ) { bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, diff --git a/kernels/zen4/CMakeLists.txt b/kernels/zen4/CMakeLists.txt index e2093ed82..b6bd12d44 100644 --- a/kernels/zen4/CMakeLists.txt +++ b/kernels/zen4/CMakeLists.txt @@ -3,5 +3,5 @@ add_subdirectory(1) add_subdirectory(1m) add_subdirectory(3) - +add_subdirectory(aocl_smart) diff --git a/kernels/zen4/aocl_smart/CMakeLists.txt b/kernels/zen4/aocl_smart/CMakeLists.txt new file mode 100644 index 000000000..ef10975d2 --- /dev/null +++ b/kernels/zen4/aocl_smart/CMakeLists.txt @@ -0,0 +1,6 @@ +##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.## + +target_sources("${PROJECT_NAME}" + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/bli_aocl_smart.c + ) diff --git a/kernels/zen4/aocl_smart/bli_aocl_smart.c b/kernels/zen4/aocl_smart/bli_aocl_smart.c new file mode 100644 index 000000000..96e45b713 --- /dev/null +++ b/kernels/zen4/aocl_smart/bli_aocl_smart.c @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +/* This function determines if we need to take SUP or native path + for given matrix sizes for zen4 configuration. 
+ * Returns TRUE if the dimensions fall under SUP range (double: see the size checks in the body)
+ * Returns FALSE if the dimensions fall under Native range (non-double types defer to bli_cntx_l3_sup_thresh_is_met)
+*/
+bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx )
+{
+    num_t dt = bli_obj_dt( c ); // dt of c decides which policy below is used
+
+    if( dt == BLIS_DOUBLE )
+    {
+        dim_t k = bli_obj_width_after_trans( a ); // k: width of a after its transposition is applied
+        dim_t m, n; // both read from c; assignment order depends on the storage check below
+
+        const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+        if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
+        {
+            m = bli_obj_width(c); // disliked storage: m and n are taken swapped
+            n = bli_obj_length(c); // NOTE(review): both checks below are symmetric in m and n, so the swap does not alter the result
+        }
+        else
+        {
+            m = bli_obj_length( c );
+            n = bli_obj_width( c );
+        }
+        // Skinny shapes: SUP whenever m or n is below 1000; k is not examined here.
+        if((m < 1000) || (n < 1000)) return TRUE;
+        // Small shapes: SUP when m, n and k are all below 5000.
+        if((m < 5000) && (n < 5000) && (k < 5000)) return TRUE;
+        return FALSE; // neither size test matched: report native range
+    }
+    else // every datatype other than BLIS_DOUBLE is delegated unchanged
+        return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
+}
diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h
index e09d4bba4..701e2ecb4 100644
--- a/kernels/zen4/bli_kernels_zen4.h
+++ b/kernels/zen4/bli_kernels_zen4.h
@@ -197,4 +197,13 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x1 )
 GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x4 )
 GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x3 )
 GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x2 )
-GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
\ No newline at end of file
+GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_2x1 )
+
+// Threshold function for zen4 GEMM: returns TRUE for the SUP range, FALSE for the native range.
+bool bli_cntx_gemmsup_thresh_is_met_zen4
+    (
+        obj_t* a,
+        obj_t* b,
+        obj_t* c,
+        cntx_t* cntx
+    );