From cf31fcd02077840dccbe2ec707a688b0004af80e Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Wed, 17 Aug 2022 17:17:43 +0530 Subject: [PATCH] Fine-tuned thresholds and AOCL dynamic for zgemm for skinny matrices. -Updated the optimal thread count in the zgemm sup path for skinny matrices. -Fine-tuned the threshold values for the small and sup paths to improve overall zgemm performance. -The zgemm small path is selected for inputs with transb as N. -Redirection of inputs among the small, sup and native paths was fine-tuned. AMD-Internal : [CPUPL-1900] Change-Id: Ide37c8255def770b4b74bc6e7c6edb5ee15d3b1f --- config/zen/bli_cntx_init_zen.c | 4 ++-- frame/base/bli_rntm.c | 19 +++++++++++++++++-- frame/compat/bla_gemm_amd.c | 6 ++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 3fea3ea8f..f527fe58d 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -231,9 +231,9 @@ void bli_cntx_init_zen( cntx_t* cntx ) // Initialize sup thresholds with architecture-appropriate values. // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 128 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 128 ); // Initialize the context with the sup thresholds. 
bli_cntx_set_l3_sup_thresh diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 5efba4f2f..d6712b060 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -624,14 +624,29 @@ void bli_nthreads_optimum( dim_t n = bli_obj_width(c); dim_t k = bli_obj_width_after_trans(a); - if((m<=128 || n<=128 || k<=128) && ((m+n+k) <= 400) ) + if((m<=128 || n<=128 || k<=128) && ((m+n+k) <= 400)) { n_threads_ideal = 8; } - else if((m<=256 || n<=256 || k<=256) && ((m+n+k) <= 800) ) + else if((m<=256 || n<=256 || k<=256) && ((m+n+k) <= 800)) { n_threads_ideal = 16; } + if((m<=48) || (n<=48) || (k<=48)) + { + if((m+n+k) <= 840) + { + n_threads_ideal = 8; + } + else if((m+n+k) <= 1240) + { + n_threads_ideal = 16; + } + else if((m+n+k) <= 1540) + { + n_threads_ideal = 32; + } + } } else if( family == BLIS_SYRK && bli_obj_is_double(c)) { diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c index adc83f073..2a9dcb99d 100644 --- a/frame/compat/bla_gemm_amd.c +++ b/frame/compat/bla_gemm_amd.c @@ -762,7 +762,7 @@ void zgemm_ - For single thread, the API has no constraints before invoking. - For multiple threads, the constraint is that m and n should individually be less than 128. */ - if((k0==1) && ((nt==0) || ((nt==1) && (m0 < 128) && (n0 < 128))) + if((k0 == 1) && ((nt == 0) || ((nt == 1) && (m0 < 128) && (n0 < 128))) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb)) { @@ -853,9 +853,11 @@ void zgemm_ } #endif } + #ifdef BLIS_ENABLE_SMALL_MATRIX - if (((nt == 0) && (m0 <= 40) && (n0 <= 40) && (k0 <= 512)) || + if (((nt == 0) && (((m0 <= 40) && (n0 <= 40)) || + (m0 <= 128) && (n0 <= 128) && bli_is_notrans(blis_transb)) && (k0 <= 512)) || ((nt == 1) && (((m0 <= 32) || (n0 <= 32) || (k0 <= 32)) && ((m0 + n0 + k0) <= 100)))) { err_t status = BLIS_NOT_YET_IMPLEMENTED;