Workaround for perf regression observed for sgemm

Details:
- A performance regression is observed in BLIS 3.1 for certain m, n, k inputs where m, n, k > 512 and m > 4 * n. The root cause was traced to commit 11dfc176a3, in which BLIS_THREAD_RATIO_M was changed from 2 to 1. That change was not part of BLIS 3.0.6, which is why the performance drop first appears in 3.1.
- This workaround doubles the m dimension that is passed as an argument to bli_rntm_set_ways_for_op, which is used to determine the ic/jc work split across threads. BLIS_THREAD_RATIO_M itself is not changed back to 2; instead, the same effect is induced through the doubled m dimension.

AMD-Internal: [CPUPL-1909]
Change-Id: I3b6ec4d4a22154289cb56d8f7db4cb60e5f34afe
2026-05-13 02:25:39 +00:00 · 2021-10-11 17:51:23 +05:30
parent 30038af896
commit d683c224e8
1 changed files with 21 additions and 4 deletions
--- a/frame/3/gemm/bli_gemm_front.c
+++ b/frame/3/gemm/bli_gemm_front.c
@@ -173,7 +173,24 @@ void bli_gemm_front
 		// or the inlined code above.
 		bli_obj_swap_pack_schemas( &a_local, &b_local );
 	}
-
+	
+	dim_t m_dim_local = bli_obj_length( &c_local );
+	dim_t n_dim_local = bli_obj_width( &c_local );
+	dim_t k_dim_local = bli_obj_width( &a_local );
+#ifdef BLIS_CONFIG_EPYC
+	// Regression observed in sgemm native path in cases where m >= 4 * n 
+	// after BLIS_THREAD_RATIO_M updated from 2 to 1 as part of commit 
+	// 11dfc176a3c422729f453f6c23204cf023e9954d. Temporary workaround for
+	// the issue.
+	if( bli_obj_is_float( &c_local ) &&
+	    ( n_dim_local >= 1024 ) &&
+	    ( k_dim_local >= 1024 ) &&
+	    ( m_dim_local >= ( 4 * n_dim_local ) ) )
+	{
+		m_dim_local *= 2;
+	}
+#endif
+	
 	// Parse and interpret the contents of the rntm_t object to properly
 	// set the ways of parallelism for each loop, and then make any
 	// additional modifications necessary for the current operation.
@@ -181,9 +198,9 @@ void bli_gemm_front
 	(
 	  BLIS_GEMM,
 	  BLIS_LEFT, // ignored for gemm/hemm/symm
-	  bli_obj_length( &c_local ),
-	  bli_obj_width( &c_local ),
-	  bli_obj_width( &a_local ),
+	  m_dim_local,
+	  n_dim_local,
+	  k_dim_local,
 	  rntm
 	);