mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
Partial completion of work in L1 APIs
- Compute was only partially completed because BLIS was unable to launch the required number of threads: rntm was returning a thread count greater than the maximum number of threads that can be launched in the subsequent parallel region.
- Added 'omp_get_num_threads' inside the parallel regions to get the actual number of threads spawned. The work distribution now happens based on the actual number of threads launched in that region.

AMD-Internal: [CPUPL-3268]
Change-Id: I086ad4b9b644f966b7bab439e43222396f0c2bf0
This commit is contained in:
@@ -398,11 +398,17 @@ void daxpy_blis_impl
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
|
||||
// Calculate the compute range for the current thread
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n_elem,
|
||||
nt,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
|
||||
@@ -466,11 +466,17 @@ double ddot_blis_impl
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
|
||||
// Calculate the compute range for the current thread
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n_elem,
|
||||
nt,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
@@ -492,13 +498,18 @@ double ddot_blis_impl
|
||||
);
|
||||
}
|
||||
|
||||
// Accumulating the nt thread outputs to rho
|
||||
for ( dim_t i = 0; i < nt; i++ )
|
||||
rho += rho_temp[i];
|
||||
|
||||
// Releasing the allocated memory if it was allocated
|
||||
if( bli_mem_is_alloc(&mem_buf_rho))
|
||||
/*
|
||||
Accumulate the values in rho_temp only when mem is allocated.
|
||||
When the memory cannot be allocated rho_temp will point to
|
||||
rho
|
||||
*/
|
||||
if (bli_mem_is_alloc(&mem_buf_rho))
|
||||
{
|
||||
// Accumulating the nt thread outputs to rho
|
||||
for (dim_t i = 0; i < nt; i++)
|
||||
rho += rho_temp[i];
|
||||
|
||||
// Releasing the allocated memory if it was allocated
|
||||
bli_membrk_release(&rntm, &mem_buf_rho);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -383,11 +383,17 @@ void dscal_blis_impl
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
|
||||
// Calculate the compute range for the current thread
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n_elem,
|
||||
nt,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
@@ -563,11 +569,17 @@ void zdscal_blis_impl
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
|
||||
// Calculate the compute range for the current thread
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n_elem,
|
||||
nt,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user