From 828ac8e2dda7c331ffc2479f957b4974f6cc8a30 Mon Sep 17 00:00:00 2001 From: Harihara Sudhan S Date: Wed, 26 Apr 2023 18:05:56 +0530 Subject: [PATCH] Partial completion of work in L1 APIs - Partial completion of compute was happening since BLIS was unable to launch the required number of threads. This was because rntm was returning a thread count greater than the maximum number of threads that can be launched in the subsequent parallel region. - Added 'omp_get_num_threads' inside the parallel regions to get the actual number of threads spawned. The work distribution happens based on the actual number of threads launched in that region. AMD-Internal: [CPUPL-3268] Change-Id: I086ad4b9b644f966b7bab439e43222396f0c2bf0 --- frame/compat/bla_axpy_amd.c | 10 ++++++++-- frame/compat/bla_dot_amd.c | 27 +++++++++++++++++++-------- frame/compat/bla_scal_amd.c | 20 ++++++++++++++++---- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/frame/compat/bla_axpy_amd.c b/frame/compat/bla_axpy_amd.c index f6a64c40a..0e24d7d4a 100644 --- a/frame/compat/bla_axpy_amd.c +++ b/frame/compat/bla_axpy_amd.c @@ -398,11 +398,17 @@ void daxpy_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c index 3d0648d27..213fd14a4 100644 --- a/frame/compat/bla_dot_amd.c +++ b/frame/compat/bla_dot_amd.c @@ -466,11 +466,17 @@ double ddot_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); @@ -492,13 +498,18 @@ double ddot_blis_impl ); } - // Accumulating the nt thread outputs to rho - for ( dim_t i = 0; i < nt; i++ ) - rho += rho_temp[i]; - - // Releasing the allocated memory if it was allocated - if( bli_mem_is_alloc(&mem_buf_rho)) + /* + Accumulate the values in rho_temp only when mem is allocated. + When the memory cannot be allocated rho_temp will point to + rho + */ + if (bli_mem_is_alloc(&mem_buf_rho)) { + // Accumulating the nt thread outputs to rho + for (dim_t i = 0; i < nt; i++) + rho += rho_temp[i]; + + // Releasing the allocated memory if it was allocated bli_membrk_release(&rntm, &mem_buf_rho); } #endif diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index bec3515a0..041c1b6a8 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -383,11 +383,17 @@ void dscal_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id ); @@ -563,11 +569,17 @@ void zdscal_blis_impl // Get the thread ID dim_t thread_id = omp_get_thread_num(); - // Calculate the compute range for the current thread + // Get the actual number of threads spawned + dim_t nt_use = omp_get_num_threads(); + + /* + Calculate the compute range for the current thread + based on the actual number of threads spawned + */ bli_thread_vector_partition ( n_elem, - nt, + nt_use, &start, &length, thread_id );