BLIS-Nov2022: HPL memory issues with GCC.

The HPL script was setting threading the BLIS manual way, i.e. by setting
BLIS_IC_NT explicitly. This causes bli_rntm_num_threads() to return
-1, which wasn't trapped in the parallelised BLAS1 and BLAS2 routines.

Fix: if this occurs, set the local number of threads to the product of the
BLIS_JC_NT * BLIS_PC_NT * BLIS_IC_NT * BLIS_JR_NT * BLIS_IR_NT values.

Note: BLIS_PC_NT should always be 1, but this environment variable
is currently being read (contrary to documentation), so include it
for now.

Other changes:
* use the _Pragma( "omp ..." ) operator form instead of #pragma omp in all code used on AMD
* frame/2/gemv/bli_gemv_unf_var1_amd.c: Remove is_omp_mt_enabled flag

AMD-Internal: [CPUPL-2803]
Change-Id: I37e8b038e5640d6693a87be0609888186322b465
This commit is contained in:
Edward Smyth
2022-12-05 08:16:57 -05:00
parent 5f1ea3246a
commit 7f86561d26
4 changed files with 62 additions and 13 deletions

View File

@@ -501,13 +501,6 @@ void bli_sgemv_unf_var1
return;
}
// If both multithreading and OpenMP are enabled, GEMV will multithread
#if defined(BLIS_ENABLE_MULTITHREADING) && defined(BLIS_ENABLE_OPENMP)
bool is_omp_mt_enabled = TRUE;
#else
bool is_omp_mt_enabled = FALSE;
#endif
dim_t nt_max;
rntm_t rnmt_obj;
@@ -517,9 +510,23 @@ void bli_sgemv_unf_var1
// Query the total number of threads from the rntm_t object.
nt_max = bli_rntm_num_threads( &rnmt_obj );
if ( ( nt_max > 1 ) & ( is_omp_mt_enabled == TRUE ) )
if (nt_max<=0)
{
// nt_max is less than one if BLIS manual setting of parallelism
// has been used. Parallelism here will be the product of the ways values.
dim_t jc, pc, ic, jr, ir;
jc = bli_rntm_jc_ways( &rnmt_obj );
pc = bli_rntm_pc_ways( &rnmt_obj );
ic = bli_rntm_ic_ways( &rnmt_obj );
jr = bli_rntm_jr_ways( &rnmt_obj );
ir = bli_rntm_ir_ways( &rnmt_obj );
nt_max = jc*pc*ic*jr*ir;
}
// If OpenMP is enabled, GEMV will multithread
#ifdef BLIS_ENABLE_OPENMP
if ( nt_max > 1 )
{
b_fuse = 4;
//Setting the thread count to the maximum number of threads provided
@@ -545,10 +552,10 @@ void bli_sgemv_unf_var1
cntx,
nt
);
#endif// BLIS_ENABLE_OPENMP
}
else
{
#endif// BLIS_ENABLE_OPENMP
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
@@ -575,7 +582,9 @@ void bli_sgemv_unf_var1
cntx
);
}
#ifdef BLIS_ENABLE_OPENMP
}
#endif// BLIS_ENABLE_OPENMP
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )

View File

@@ -307,6 +307,20 @@ double ddot_blis_impl
// Query the total number of threads from the rntm_t object.
nt = bli_rntm_num_threads(&rntm);
if (nt<=0)
{
// nt is less than one if BLIS manual setting of parallelism
// has been used. Parallelism here will be product of values.
dim_t jc, pc, ic, jr, ir;
jc = bli_rntm_jc_ways( &rntm );
pc = bli_rntm_pc_ways( &rntm );
ic = bli_rntm_ic_ways( &rntm );
jr = bli_rntm_jr_ways( &rntm );
ir = bli_rntm_ir_ways( &rntm );
nt = jc*pc*ic*jr*ir;
}
mem_t local_mem_buf = { 0 };
bli_membrk_rntm_set_membrk(&rntm);
@@ -348,7 +362,7 @@ double ddot_blis_impl
n0_per_thread = n0 / nt;
n0_rem = n0 % nt;
#pragma omp parallel num_threads(nt)
_Pragma( "omp parallel num_threads(nt)" )
{
// Getting the actual number of threads that are spawned.
dim_t nt_real = omp_get_num_threads();

View File

@@ -268,6 +268,19 @@ void dscal_blis_impl
bli_rntm_init_from_global( &rntm_local );
dim_t nt = bli_rntm_num_threads( &rntm_local );
if (nt<=0)
{
// nt is less than one if BLIS manual setting of parallelism
// has been used. Parallelism here will be product of values.
dim_t jc, pc, ic, jr, ir;
jc = bli_rntm_jc_ways( &rntm_local );
pc = bli_rntm_pc_ways( &rntm_local );
ic = bli_rntm_ic_ways( &rntm_local );
jr = bli_rntm_jr_ways( &rntm_local );
ir = bli_rntm_ir_ways( &rntm_local );
nt = jc*pc*ic*jr*ir;
}
#ifdef AOCL_DYNAMIC
dim_t nt_ideal;
@@ -281,7 +294,7 @@ void dscal_blis_impl
dim_t n_elem_per_thrd = n0 / nt;
dim_t n_elem_rem = n0 % nt;
#pragma omp parallel num_threads( nt )
_Pragma( "omp parallel num_threads(nt)" )
{
// Getting the actual number of threads that are spawned.
dim_t nt_real = omp_get_num_threads();
@@ -457,6 +470,19 @@ void zdscal_blis_impl
bli_rntm_init_from_global( &rntm_local );
dim_t nt = bli_rntm_num_threads( &rntm_local );
if (nt<=0)
{
// nt is less than one if BLIS manual setting of parallelism
// has been used. Parallelism here will be product of values.
dim_t jc, pc, ic, jr, ir;
jc = bli_rntm_jc_ways( &rntm_local );
pc = bli_rntm_pc_ways( &rntm_local );
ic = bli_rntm_ic_ways( &rntm_local );
jr = bli_rntm_jr_ways( &rntm_local );
ir = bli_rntm_ir_ways( &rntm_local );
nt = jc*pc*ic*jr*ir;
}
#ifdef AOCL_DYNAMIC
dim_t nt_ideal;
@@ -471,7 +497,7 @@ void zdscal_blis_impl
dim_t n_elem_per_thread = n0 / nt;
dim_t n_elem_rem = n0 % nt;
#pragma omp parallel num_threads( nt )
_Pragma( "omp parallel num_threads(nt)" )
{
// Getting the actual number of threads that are spawned.
dim_t nt_real = omp_get_num_threads();

View File

@@ -565,7 +565,7 @@ void bli_multi_sgemv_4x2
// Calculate the total number of multithreaded iteration
total_iteration = b_n / b_fuse;
#pragma omp parallel for num_threads(n_threads)
_Pragma( "omp parallel for num_threads(n_threads)" )
for (dim_t j = 0; j < total_iteration; j++)
{
float *A1 = a + (b_fuse * j) * lda;