From d683c224e8f5caf24b87fa406191eff35e2b0223 Mon Sep 17 00:00:00 2001 From: mkadavil Date: Mon, 11 Oct 2021 17:51:23 +0530 Subject: [PATCH] Workaround for perf regression observed for sgemm Details: - A perf regression is observed in BLIS 3.1 for certain m,n,k inputs where (m,n,k > 512) and (m > 4 * n). The root cause was traced to commit 11dfc176a3c422729f453f6c23204cf023e9954d, in which BLIS_THREAD_RATIO_M was updated from 2 to 1. That change was not part of BLIS 3.0.6, hence the new perf drop in 3.1. - This workaround doubles the m dimension that is passed as an argument to bli_rntm_set_ways_for_op, which is used to determine the ic,jc work split among the threads. BLIS_THREAD_RATIO_M itself is not changed back to 2; instead, the same effect is induced by passing the doubled m dimension. AMD-Internal: [CPUPL-1909] Change-Id: I3b6ec4d4a22154289cb56d8f7db4cb60e5f34afe --- frame/3/gemm/bli_gemm_front.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 662a6da9b..a065156bb 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -173,7 +173,24 @@ void bli_gemm_front // or the inlined code above. bli_obj_swap_pack_schemas( &a_local, &b_local ); } - + + dim_t m_dim_local = bli_obj_length( &c_local ); + dim_t n_dim_local = bli_obj_width( &c_local ); + dim_t k_dim_local = bli_obj_width( &a_local ); +#ifdef BLIS_CONFIG_EPYC + // Regression observed in sgemm native path in cases where m >= 4 * n + // after BLIS_THREAD_RATIO_M was updated from 2 to 1 as part of commit + // 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for + // the issue. 
+ if( bli_obj_is_float( &c_local ) && + ( n_dim_local >= 1024 ) && + ( k_dim_local >= 1024 ) && + ( m_dim_local >= ( 4 * n_dim_local ) ) ) + { + m_dim_local *= 2; + } +#endif + // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -181,9 +198,9 @@ void bli_gemm_front ( BLIS_GEMM, BLIS_LEFT, // ignored for gemm/hemm/symm - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), + m_dim_local, + n_dim_local, + k_dim_local, rntm );