From 2bc2d11e8aa21cdb2361d0e28c0c4e5b85210ba7 Mon Sep 17 00:00:00 2001 From: Arnav Sharma Date: Mon, 21 Nov 2022 13:16:44 +0530 Subject: [PATCH] Fix for DSCAL Multi-thread implementation - In the existing implementation, when the actual number of threads spawned differs from the number of threads requested for the parallel region, only part of the job is executed. - Added a fix that queries the actual number of threads spawned and assigns the entire workload to a single thread when the number of threads spawned does not match the number requested. AMD-Internal: [CPUPL-2761] Change-Id: Ife36e6e4993bdcc5a506349b54b2177173866e32 --- frame/compat/bla_scal_amd.c | 75 +++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 24 deletions(-) diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 195a14ee5..10a402fc3 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -283,37 +283,64 @@ void dscal_blis_impl #pragma omp parallel num_threads( nt ) { - // The following conditions handle the optimal distribution of - // load among the threads. - // Say we have n0 = 50 & nt = 4. - // So we get 12 ( n0 / nt ) elements per thread along with 2 - // remaining elements. Each of these remaining elements is given - // to the last threads, respectively. - // So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements, - // respectively. + // Getting the actual number of threads that are spawned. + dim_t nt_real = omp_get_num_threads(); dim_t t_id = omp_get_thread_num(); - dim_t npt, offset; - if ( t_id < ( nt - n_elem_rem ) ) + // The actual number of threads spawned might be different + // from the predicted number of threads for which this parallel + // region is being generated. Thus, in such a case we are + // falling back to the Single-Threaded call. 
+ if ( nt_real != nt ) { - npt = n_elem_per_thrd; - offset = t_id * npt * incx0; + // More than one thread can still be spawned but since we + // are falling back to the ST call, we are + // calling the kernel from thread 0 only. + if ( t_id == 0 ) + { + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n0, + (double*) alpha, + x0, incx0, + NULL + ); + } } else { - npt = n_elem_per_thrd + 1; - offset = ( ( t_id * n_elem_per_thrd ) + - ( t_id - ( nt - n_elem_rem ) ) ) * incx0; - } + // The following conditions handle the optimal distribution of + // load among the threads. + // Say we have n0 = 50 & nt = 4. + // So we get 12 ( n0 / nt ) elements per thread along with 2 + // remaining elements. Each of these remaining elements is given + // to the last threads, respectively. + // So, t0, t1, t2 and t3 gets 12, 12, 13 and 13 elements, + // respectively. + dim_t npt, offset; - bli_dscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - npt, - (double*) alpha, - x0 + offset, incx0, - NULL - ); + if ( t_id < ( nt - n_elem_rem ) ) + { + npt = n_elem_per_thrd; + offset = t_id * npt * incx0; + } + else + { + npt = n_elem_per_thrd + 1; + offset = ( ( t_id * n_elem_per_thrd ) + + ( t_id - ( nt - n_elem_rem ) ) ) * incx0; + } + + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + npt, + (double*) alpha, + x0 + offset, incx0, + NULL + ); + } } } #else