Fine-tuned thresholds and AOCL dynamic threading for zgemm on skinny matrices.

- Updated the optimal thread count in the zgemm sup path for skinny matrices.
- Fine-tuned the threshold values for the small and sup paths
  to improve overall zgemm performance.
- The zgemm small path is now selected for inputs with transb set to N (no transpose).
- Fine-tuned the redirection of inputs among the small, sup and
  native paths.

AMD-Internal : [CPUPL-1900]

Change-Id: Ide37c8255def770b4b74bc6e7c6edb5ee15d3b1f
This commit is contained in:
Vignesh Balasubramanian
2022-08-17 17:17:43 +05:30
parent 32c9239c7f
commit cf31fcd020
3 changed files with 23 additions and 6 deletions

View File

@@ -231,9 +231,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 128 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 128 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh

View File

@@ -624,14 +624,29 @@ void bli_nthreads_optimum(
dim_t n = bli_obj_width(c);
dim_t k = bli_obj_width_after_trans(a);
if((m<=128 || n<=128 || k<=128) && ((m+n+k) <= 400) )
if((m<=128 || n<=128 || k<=128) && ((m+n+k) <= 400))
{
n_threads_ideal = 8;
}
else if((m<=256 || n<=256 || k<=256) && ((m+n+k) <= 800) )
else if((m<=256 || n<=256 || k<=256) && ((m+n+k) <= 800))
{
n_threads_ideal = 16;
}
if((m<=48) || (n<=48) || (k<=48))
{
if((m+n+k) <= 840)
{
n_threads_ideal = 8;
}
else if((m+n+k) <= 1240)
{
n_threads_ideal = 16;
}
else if((m+n+k) <= 1540)
{
n_threads_ideal = 32;
}
}
}
else if( family == BLIS_SYRK && bli_obj_is_double(c))
{

View File

@@ -762,7 +762,7 @@ void zgemm_
- For single thread, the API has no constraints before invoking.
- For multiple threads, the constraint is that m and n should individually be less than 128.
*/
if((k0==1) && ((nt==0) || ((nt==1) && (m0 < 128) && (n0 < 128)))
if((k0 == 1) && ((nt == 0) || ((nt == 1) && (m0 < 128) && (n0 < 128)))
&& bli_is_notrans(blis_transa)
&& bli_is_notrans(blis_transb))
{
@@ -853,9 +853,11 @@ void zgemm_
}
#endif
}
#ifdef BLIS_ENABLE_SMALL_MATRIX
if (((nt == 0) && (m0 <= 40) && (n0 <= 40) && (k0 <= 512)) ||
if (((nt == 0) && (((m0 <= 40) && (n0 <= 40)) ||
(m0 <= 128) && (n0 <= 128) && bli_is_notrans(blis_transb)) && (k0 <= 512)) ||
((nt == 1) && (((m0 <= 32) || (n0 <= 32) || (k0 <= 32)) && ((m0 + n0 + k0) <= 100))))
{
err_t status = BLIS_NOT_YET_IMPLEMENTED;