From 69be3b055779e8fe482146a8c5cd736e382d4e57 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Fri, 18 Nov 2022 16:27:13 +0530 Subject: [PATCH] Fix for DDOT Multi-thread implementation - In the existing implementation, when the actual number of threads spawned is different from the indicated number of threads for the parallel region, partial job is being executed. - Fix added to identify actual number of threads spawned and allocate the work load to single thread in case of discrepancy in the number of threads spawned vs indicated. AMD-Internal: [SWLCSG-1659] Change-Id: I882d0af608009e502189a4469eb3483d659c2b3f --- frame/compat/bla_dot_amd.c | 93 ++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c index b7d397eca..e9536de3c 100644 --- a/frame/compat/bla_dot_amd.c +++ b/frame/compat/bla_dot_amd.c @@ -300,6 +300,7 @@ double ddot_blis_impl rntm_t rntm; double* rho_temp = NULL; dim_t nt, n0_per_thread, n0_rem, nt_pred; + dim_t i; rho = 0; // Initialize a local runtime with global settings. bli_rntm_init_from_global(&rntm); @@ -324,6 +325,13 @@ double ddot_blis_impl return BLIS_NULL_POINTER; rho_temp = bli_mem_buffer(&local_mem_buf); if ( NULL == rho_temp ) return BLIS_NULL_POINTER; + + // Initializing rho_temp array to 0 + for ( i = 0; i < nt; i++ ) + { + rho_temp[i] = 0; + } + #ifdef AOCL_DYNAMIC // Calculate the optimal number of threads required // based on input dimension. These conditions are taken @@ -340,43 +348,72 @@ double ddot_blis_impl n0_per_thread = n0 / nt; n0_rem = n0 % nt; - // Multithreading Implementation #pragma omp parallel num_threads(nt) { + // Getting the actual number of threads that are spawned. + dim_t nt_real = omp_get_num_threads(); dim_t t_id = omp_get_thread_num(); - // The following conditions handle the optimal distribution of - // load among the threads. - // Say we have n0 = 50 & nt = 4. - // So we get 12 ( n0 / nt ) elements per thread along with 2 - // remaining elements. Each of these remaining elements is given - // to the last threads, respectively. - // So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements, - // respectively. - dim_t npt, offset; - if ( t_id < ( nt - n0_rem ) ) + + // The actual number of threads spawned might be different + // from the predicted number of threads for which this parallel + // region is being generated. Thus, in such a case we are + // falling back to the Single-Threaded call. + if ( nt_real != nt ) { - npt = n0_per_thread; - offset = t_id * npt; + // More than one thread can still be spawned but since we + // are falling back to the ST call, we are + // calling the kernel from thread 0 only. + if ( t_id == 0 ) + { + bli_ddotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + rho_temp, + NULL + ); + } } else { - npt = n0_per_thread + 1; - offset = ( ( t_id * n0_per_thread ) + - ( t_id - ( nt - n0_rem ) ) ); + // The following conditions handle the optimal distribution + // of load among the threads. + // Say we have n0 = 50 & nt = 4. + // So we get 12 ( n0 / nt ) elements per thread along with 2 + // remaining elements. Each of these remaining elements is + // given to the last threads, respectively. + // So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements, + // respectively. + dim_t npt, offset; + if ( t_id < ( nt - n0_rem ) ) + { + npt = n0_per_thread; + offset = t_id * npt; + } + else + { + npt = n0_per_thread + 1; + offset = ( ( t_id * n0_per_thread ) + + ( t_id - ( nt - n0_rem ) ) ); + } + bli_ddotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + npt, + x0 + ( offset * incx0 ), incx0, + y0 + ( offset * incy0 ), incy0, + rho_temp + t_id, + NULL + ); } - bli_ddotv_zen_int10 - ( - BLIS_NO_CONJUGATE, - BLIS_NO_CONJUGATE, - npt, - x0 + ( offset * incx0 ), incx0, - y0 + ( offset * incy0 ), incy0, - rho_temp + t_id, - NULL - ); } - // Accumulating the nt threads output - for ( int i = 0; i < nt; i++ ) + + // Accumulating the nt thread outputs to rho + for ( i = 0; i < nt; i++ ) rho += rho_temp[i]; // Releasing the allocated memory