Partial completion of work in L1 APIs

- Compute was only partially completed because BLIS was unable to
  launch the required number of threads: rntm returned a thread
  count greater than the maximum number of threads that can be
  launched in the subsequent parallel region.
- Added a call to 'omp_get_num_threads' inside each parallel region to
  get the actual number of threads spawned. Work is now distributed
  based on the actual number of threads launched in that region.

AMD-Internal: [CPUPL-3268]
Change-Id: I086ad4b9b644f966b7bab439e43222396f0c2bf0
This commit is contained in:
Harihara Sudhan S
2023-04-26 18:05:56 +05:30
parent 7e50ba669b
commit 828ac8e2dd
3 changed files with 43 additions and 14 deletions

View File

@@ -398,11 +398,17 @@ void daxpy_blis_impl
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Calculate the compute range for the current thread
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n_elem,
nt,
nt_use,
&start, &length,
thread_id
);

View File

@@ -466,11 +466,17 @@ double ddot_blis_impl
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Calculate the compute range for the current thread
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n_elem,
nt,
nt_use,
&start, &length,
thread_id
);
@@ -492,13 +498,18 @@ double ddot_blis_impl
);
}
// Accumulating the nt thread outputs to rho
for ( dim_t i = 0; i < nt; i++ )
rho += rho_temp[i];
// Releasing the allocated memory if it was allocated
if( bli_mem_is_alloc(&mem_buf_rho))
/*
Accumulate the values in rho_temp only when mem is allocated.
When the memory cannot be allocated rho_temp will point to
rho
*/
if (bli_mem_is_alloc(&mem_buf_rho))
{
// Accumulating the nt thread outputs to rho
for (dim_t i = 0; i < nt; i++)
rho += rho_temp[i];
// Releasing the allocated memory if it was allocated
bli_membrk_release(&rntm, &mem_buf_rho);
}
#endif

View File

@@ -383,11 +383,17 @@ void dscal_blis_impl
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Calculate the compute range for the current thread
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n_elem,
nt,
nt_use,
&start, &length,
thread_id
);
@@ -563,11 +569,17 @@ void zdscal_blis_impl
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Calculate the compute range for the current thread
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n_elem,
nt,
nt_use,
&start, &length,
thread_id
);