mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Fix for DDOT Multi-thread implementation
- In the existing implementation, when the actual number of threads spawned is different from the indicated number of threads for the parallel region, partial job is being executed. - Fix added to identify actual number of threads spawned and allocate the work load to single thread in case of discrepancy in the number of threads spawned vs indicated. AMD-Internal: [SWLCSG-1659] Change-Id: I882d0af608009e502189a4469eb3483d659c2b3f
This commit is contained in:
committed by
Arnav Sharma
parent
11c42ce1d3
commit
69be3b0557
@@ -300,6 +300,7 @@ double ddot_blis_impl
|
||||
rntm_t rntm;
|
||||
double* rho_temp = NULL;
|
||||
dim_t nt, n0_per_thread, n0_rem, nt_pred;
|
||||
dim_t i;
|
||||
rho = 0;
|
||||
// Initialize a local runtime with global settings.
|
||||
bli_rntm_init_from_global(&rntm);
|
||||
@@ -324,6 +325,13 @@ double ddot_blis_impl
|
||||
return BLIS_NULL_POINTER;
|
||||
rho_temp = bli_mem_buffer(&local_mem_buf);
|
||||
if ( NULL == rho_temp ) return BLIS_NULL_POINTER;
|
||||
|
||||
// Initializing rho_temp array to 0
|
||||
for ( i = 0; i < nt; i++ )
|
||||
{
|
||||
rho_temp[i] = 0;
|
||||
}
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
// Calculate the optimal number of threads required
|
||||
// based on input dimension. These conditions are taken
|
||||
@@ -340,43 +348,72 @@ double ddot_blis_impl
|
||||
n0_per_thread = n0 / nt;
|
||||
n0_rem = n0 % nt;
|
||||
|
||||
// Multithreading Implementation
|
||||
#pragma omp parallel num_threads(nt)
|
||||
{
|
||||
// Getting the actual number of threads that are spawned.
|
||||
dim_t nt_real = omp_get_num_threads();
|
||||
dim_t t_id = omp_get_thread_num();
|
||||
// The following conditions handle the optimal distribution of
|
||||
// load among the threads.
|
||||
// Say we have n0 = 50 & nt = 4.
|
||||
// So we get 12 ( n0 / nt ) elements per thread along with 2
|
||||
// remaining elements. Each of these remaining elements is given
|
||||
// to the last threads, respectively.
|
||||
// So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements,
|
||||
// respectively.
|
||||
dim_t npt, offset;
|
||||
if ( t_id < ( nt - n0_rem ) )
|
||||
|
||||
// The actual number of threads spawned might be different
|
||||
// from the predicted number of threads for which this parallel
|
||||
// region is being generated. Thus, in such a case we are
|
||||
// falling back to the Single-Threaded call.
|
||||
if ( nt_real != nt )
|
||||
{
|
||||
npt = n0_per_thread;
|
||||
offset = t_id * npt;
|
||||
// More than one thread can still be spawned but since we
|
||||
// are falling back to the ST call, we are
|
||||
// calling the kernel from thread 0 only.
|
||||
if ( t_id == 0 )
|
||||
{
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
rho_temp,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
npt = n0_per_thread + 1;
|
||||
offset = ( ( t_id * n0_per_thread ) +
|
||||
( t_id - ( nt - n0_rem ) ) );
|
||||
// The following conditions handle the optimal distribution
|
||||
// of load among the threads.
|
||||
// Say we have n0 = 50 & nt = 4.
|
||||
// So we get 12 ( n0 / nt ) elements per thread along with 2
|
||||
// remaining elements. Each of these remaining elements is
|
||||
// given to the last threads, respectively.
|
||||
// So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements,
|
||||
// respectively.
|
||||
dim_t npt, offset;
|
||||
if ( t_id < ( nt - n0_rem ) )
|
||||
{
|
||||
npt = n0_per_thread;
|
||||
offset = t_id * npt;
|
||||
}
|
||||
else
|
||||
{
|
||||
npt = n0_per_thread + 1;
|
||||
offset = ( ( t_id * n0_per_thread ) +
|
||||
( t_id - ( nt - n0_rem ) ) );
|
||||
}
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
npt,
|
||||
x0 + ( offset * incx0 ), incx0,
|
||||
y0 + ( offset * incy0 ), incy0,
|
||||
rho_temp + t_id,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
npt,
|
||||
x0 + ( offset * incx0 ), incx0,
|
||||
y0 + ( offset * incy0 ), incy0,
|
||||
rho_temp + t_id,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
// Accumulating the nt threads output
|
||||
for ( int i = 0; i < nt; i++ )
|
||||
|
||||
// Accumulating the nt thread outputs to rho
|
||||
for ( i = 0; i < nt; i++ )
|
||||
rho += rho_temp[i];
|
||||
|
||||
// Releasing the allocated memory
|
||||
|
||||
Reference in New Issue
Block a user