Added AOCL dynamic-threading support for dtrsm at small sizes

Details:
1. Added aocl-dynamic support to the dtrsm native path.
   When m and n are both at most 512, better performance is observed with the thread count capped at 4.
2. Updated the trsm_small threshold so that when (m+n)<320,
   trsm_small is used, since it performs better than the native
   path irrespective of the number of threads.

Change-Id: Ic2c50f14db257a05e323cc97c5d1c9b73b68f487
This commit is contained in:
Nallani Bhaskar
2021-06-17 23:53:12 +05:30
parent d7377f967c
commit 75f72b7f6e
3 changed files with 16 additions and 1 deletions

View File

@@ -126,6 +126,11 @@ void bli_trsm_front
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
#ifdef AOCL_DYNAMIC
// If dynamic-threading is enabled, calculate optimum number
// of threads and update in rntm
bli_nthreads_optimum(a, b, b, BLIS_TRSM, rntm );
#endif
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.

View File

@@ -556,6 +556,16 @@ void bli_nthreads_optimum(
else
n_threads_ideal = n_threads;
}
else if( family == BLIS_TRSM && bli_obj_is_double(c))
{
dim_t m = bli_obj_length(c);
dim_t n = bli_obj_width(c);
if(m<=512 && n<=512)
n_threads_ideal = 4;
}
dim_t n_threads_opt = bli_min(n_threads, n_threads_ideal);
bli_pthread_mutex_lock( &global_rntm_mutex );

View File

@@ -602,7 +602,7 @@ void dtrsm_
* is doing better than native multithread */
bool nt = bli_thread_get_is_parallel();
if((nt==0 && m0<=1000 && n0<=1000) ||
(nt && m0<=128 && n0<=128 ) )
(nt && (m0+n0)<320) )
{
err_t status;
status = bli_trsm_small