diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c index a9534bd9a..278241aeb 100644 --- a/frame/2/gemv/bli_gemv_unf_var1_amd.c +++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c @@ -501,13 +501,6 @@ void bli_sgemv_unf_var1 return; } -// If both multithreading and OpenMP are enabled, GEMV will multithread -#if defined(BLIS_ENABLE_MULTITHREADING) && defined(BLIS_ENABLE_OPENMP) - bool is_omp_mt_enabled = TRUE; -#else - bool is_omp_mt_enabled = FALSE; -#endif - dim_t nt_max; rntm_t rnmt_obj; @@ -517,9 +510,23 @@ void bli_sgemv_unf_var1 // Query the total number of threads from the rntm_t object. nt_max = bli_rntm_num_threads( &rnmt_obj ); - if ( ( nt_max > 1 ) & ( is_omp_mt_enabled == TRUE ) ) + if (nt_max<=0) { + // nt is less than one if BLIS manual setting of parallelism + // has been used. Parallelism here will be product of values. + dim_t jc, pc, ic, jr, ir; + jc = bli_rntm_jc_ways( &rnmt_obj ); + pc = bli_rntm_pc_ways( &rnmt_obj ); + ic = bli_rntm_ic_ways( &rnmt_obj ); + jr = bli_rntm_jr_ways( &rnmt_obj ); + ir = bli_rntm_ir_ways( &rnmt_obj ); + nt_max = jc*pc*ic*jr*ir; + } + +// If OpenMP is enabled, GEMV will multithread #ifdef BLIS_ENABLE_OPENMP + if ( nt_max > 1 ) + { b_fuse = 4; //Setting the thread count to the maximum number of threads provided @@ -545,10 +552,10 @@ void bli_sgemv_unf_var1 cntx, nt ); -#endif// BLIS_ENABLE_OPENMP } else { +#endif// BLIS_ENABLE_OPENMP b_fuse = 8; for ( i = 0; i < n_iter; i += f ) @@ -575,7 +582,9 @@ void bli_sgemv_unf_var1 cntx ); } +#ifdef BLIS_ENABLE_OPENMP } +#endif// BLIS_ENABLE_OPENMP } INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 ) diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c index e9536de3c..ecffae490 100644 --- a/frame/compat/bla_dot_amd.c +++ b/frame/compat/bla_dot_amd.c @@ -307,6 +307,20 @@ double ddot_blis_impl // Query the total number of threads from the rntm_t object. nt = bli_rntm_num_threads(&rntm); + + if (nt<=0) + { + // nt is less than one if BLIS manual setting of parallelism + // has been used. Parallelism here will be product of values. + dim_t jc, pc, ic, jr, ir; + jc = bli_rntm_jc_ways( &rntm ); + pc = bli_rntm_pc_ways( &rntm ); + ic = bli_rntm_ic_ways( &rntm ); + jr = bli_rntm_jr_ways( &rntm ); + ir = bli_rntm_ir_ways( &rntm ); + nt = jc*pc*ic*jr*ir; + } + mem_t local_mem_buf = { 0 }; bli_membrk_rntm_set_membrk(&rntm); @@ -348,7 +362,7 @@ double ddot_blis_impl n0_per_thread = n0 / nt; n0_rem = n0 % nt; - #pragma omp parallel num_threads(nt) + _Pragma( "omp parallel num_threads(nt)" ) { // Getting the actual number of threads that are spawned. dim_t nt_real = omp_get_num_threads(); diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 997e03af3..71301a2e0 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -268,6 +268,19 @@ void dscal_blis_impl bli_rntm_init_from_global( &rntm_local ); dim_t nt = bli_rntm_num_threads( &rntm_local ); + if (nt<=0) + { + // nt is less than one if BLIS manual setting of parallelism + // has been used. Parallelism here will be product of values. + dim_t jc, pc, ic, jr, ir; + jc = bli_rntm_jc_ways( &rntm_local ); + pc = bli_rntm_pc_ways( &rntm_local ); + ic = bli_rntm_ic_ways( &rntm_local ); + jr = bli_rntm_jr_ways( &rntm_local ); + ir = bli_rntm_ir_ways( &rntm_local ); + nt = jc*pc*ic*jr*ir; + } + #ifdef AOCL_DYNAMIC dim_t nt_ideal; @@ -281,7 +294,7 @@ void dscal_blis_impl dim_t n_elem_per_thrd = n0 / nt; dim_t n_elem_rem = n0 % nt; - #pragma omp parallel num_threads( nt ) + _Pragma( "omp parallel num_threads(nt)" ) { // Getting the actual number of threads that are spawned. dim_t nt_real = omp_get_num_threads(); @@ -457,6 +470,19 @@ void zdscal_blis_impl bli_rntm_init_from_global( &rntm_local ); dim_t nt = bli_rntm_num_threads( &rntm_local ); + if (nt<=0) + { + // nt is less than one if BLIS manual setting of parallelism + // has been used. Parallelism here will be product of values. + dim_t jc, pc, ic, jr, ir; + jc = bli_rntm_jc_ways( &rntm_local ); + pc = bli_rntm_pc_ways( &rntm_local ); + ic = bli_rntm_ic_ways( &rntm_local ); + jr = bli_rntm_jr_ways( &rntm_local ); + ir = bli_rntm_ir_ways( &rntm_local ); + nt = jc*pc*ic*jr*ir; + } + #ifdef AOCL_DYNAMIC dim_t nt_ideal; @@ -471,7 +497,7 @@ void zdscal_blis_impl dim_t n_elem_per_thread = n0 / nt; dim_t n_elem_rem = n0 % nt; - #pragma omp parallel num_threads( nt ) + _Pragma( "omp parallel num_threads(nt)" ) { // Getting the actual number of threads that are spawned. dim_t nt_real = omp_get_num_threads(); diff --git a/kernels/zen/2/bli_gemv_zen_int_4.c b/kernels/zen/2/bli_gemv_zen_int_4.c index a4bdfb449..6970a7f62 100644 --- a/kernels/zen/2/bli_gemv_zen_int_4.c +++ b/kernels/zen/2/bli_gemv_zen_int_4.c @@ -565,7 +565,7 @@ void bli_multi_sgemv_4x2 // Calculate the total number of multithreaded iteration total_iteration = b_n / b_fuse; -#pragma omp parallel for num_threads(n_threads) + _Pragma( "omp parallel for num_threads(n_threads)" ) for (dim_t j = 0; j < total_iteration; j++) { float *A1 = a + (b_fuse * j) * lda;