Revert changes to force dgemm inputs under threshold to arch-specific kernels in single-threaded mode

- This patch reverts the earlier change that removed the check forcing
  dgemm inputs under a certain size threshold to be handled in
  single-threaded mode by kernels selected based on the architecture ID.

- With this change, such small inputs are once again forced to be computed
  in the tiny path. Previously, when this check was absent, these inputs
  were routed to the SUP path, which caused a performance regression due to
  framework overhead.

AMD-Internal: [CPUPL-5927]
Change-Id: I4a4b21fdcf7c3ffaa09efa46ba12798eca0f10bb
This commit is contained in:
harsh dave
2025-01-23 04:13:27 -05:00
committed by Harsh Dave
parent 805bd10353
commit c5e842e8d3

View File

@@ -254,12 +254,62 @@ err_t bli_dgemm_tiny
)
{
// Query the architecture ID
arch_t id = bli_arch_query_id();
arch_t arch_id = bli_arch_query_id();
//for the below tiny sizes of matrix, we force it to be ST compute.
if(m <= 24 && n <= 24 && k <= 20)
{
switch (arch_id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_FAMILY_ZEN5) || defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_X86_64)
return bli_dgemm_tiny_24x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
transa,
transb,
m,
n,
k,
alpha,
a, rs_a0, cs_a0,
b, rs_b0, cs_b0,
beta,
c, rs_c0, cs_c0
);
#endif
break;
if(FALSE == bli_thread_get_is_parallel())
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
return bli_dgemm_tiny_6x8
(
1 * (transa == BLIS_CONJ_NO_TRANSPOSE),
1 * (transb == BLIS_CONJ_NO_TRANSPOSE),
transa,
transb,
m,
n,
k,
alpha,
a, rs_a0, cs_a0,
b, rs_b0, cs_b0,
beta,
c, rs_c0, cs_c0
);
break;
default:
return BLIS_FAILURE;
}
}
if( FALSE == bli_thread_get_is_parallel() )
{
// Pick the kernel based on the architecture ID
switch (id)
switch (arch_id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4: