AOCL Dynamic Optimization for DGEMMT

- Fine-tuned the thread-allocation logic used to parallelize DGEMMT for cases where n <= 220. This improves the performance of multi-threaded DGEMMT for small values of n.

AMD-Internal: [CPUPL-2215]
Change-Id: I2654bc64d2dc43c2db911e0c9175755be3aa8ba5
This commit is contained in:
Arnav Sharma
2022-07-13 11:42:35 +05:30
committed by Arnav Sharma
parent 2ad25a7180
commit 4f96bb712e

View File

@@ -680,16 +680,35 @@ void bli_nthreads_optimum(
dim_t n = bli_obj_length(c);
dim_t k = bli_obj_width_after_trans(a);
if ( n < 32 )
if ( n < 8 )
{
if ( k <= 512)
{
n_threads_ideal = 1;
}
else if ( k <= 1024 )
{
n_threads_ideal = 4;
}
}
else if ( n < 32 )
{
if ( k < 128 )
{
n_threads_ideal = 1;
}
else if ( k == 128 )
else if ( k <= 512 )
{
n_threads_ideal = 4;
}
else if ( k <= 1024 )
{
n_threads_ideal = 6;
}
else if ( k <= 1600 )
{
n_threads_ideal = 10;
}
}
else if ( n <= 40 )
{
@@ -724,6 +743,17 @@ void bli_nthreads_optimum(
n_threads_ideal = 8;
}
}
else if ( n < 176 )
{
if ( k < 128 )
{
n_threads_ideal = 8;
}
else if ( k <= 512 )
{
n_threads_ideal = 14;
}
}
else if ( n <= 220 )
{
if ( k < 128 )