Files
blis/frame/base/bli_rntm.c
Field G. Van Zee 00e14cb6d8 Replaced use of bool_t type with C99 bool.
Details:
- Textually replaced nearly all non-comment instances of bool_t with the
  C99 bool type. A few remaining instances, such as those in the files
  bli_herk_x_ker_var2.c, bli_trmm_xx_ker_var2.c, and
  bli_trsm_xx_ker_var2.c, were promoted to dim_t since they were being
  used not for boolean purposes but to index into an array.
- This commit constitutes the third phase of a transition toward using
  C99's bool instead of bool_t, which was raised in issue #420. The first
  phase, which cleaned up various typecasts in preparation for using
  bool as the basis for bool_t (instead of gint_t), was implemented by
  commit a69a4d7. The second phase, which redefined the bool_t typedef
  in terms of bool (from gint_t), was implemented by commit 2c554c2.
2020-07-29 14:24:34 -05:00

474 lines
13 KiB
C

/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The global rntm_t structure, which holds the global thread settings
// along with a few other key parameters. bli_rntm_init_from_global()
// hands out copies of this object to callers.
rntm_t global_rntm;
// A mutex that serializes access to global_rntm. It is statically
// initialized, and bli_rntm_init_from_global() holds it while copying
// global_rntm.
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
// ----------------------------------------------------------------------------
// Initialize a caller-provided rntm_t as a copy of the global rntm_t.
//
//   rntm  destination object; overwritten with the contents of global_rntm.
//
// The copy is made while holding global_rntm_mutex.
void bli_rntm_init_from_global( rntm_t* rntm )
{
    bli_pthread_mutex_t* const guard = &global_rntm_mutex;

    // global_rntm is only meaningful after the library has been
    // initialized, so make sure that has happened before reading it.
    bli_init_once();

    // Take the snapshot inside the critical section protecting
    // global_rntm.
    bli_pthread_mutex_lock( guard );

    *rntm = global_rntm;

    bli_pthread_mutex_unlock( guard );
}
// -----------------------------------------------------------------------------
// Resolve the ways of parallelism stored in a rntm_t for a level-3
// operation.
//
//   l3_op    operation id; only BLIS_TRMM and BLIS_TRSM receive special
//            treatment.
//   side     side parameter of the operation (consulted for trmm/trsm only).
//   m, n, k  problem dimensions, forwarded to bli_rntm_set_ways_from_rntm().
//   rntm     runtime object that is read and updated in place.
//
// The ways are first resolved by bli_rntm_set_ways_from_rntm(). For trmm
// and trsm they are then redistributed among the loops. The redistribution
// only moves factors from one loop to another, so the product of the ways
// is unchanged and the num_threads field does not need to be touched.
void bli_rntm_set_ways_for_op
     (
       opid_t  l3_op,
       side_t  side,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    // Fill in the per-loop ways (if needed) from whatever information is
    // already present in the rntm_t.
    bli_rntm_set_ways_from_rntm( m, n, k, rntm );

#if 0
    printf( "bli_rntm_set_ways_for_op()\n" );
    bli_rntm_print( rntm );
#endif

    const bool is_trmm = ( l3_op == BLIS_TRMM );
    const bool is_trsm = ( l3_op == BLIS_TRSM );

    // Every other operation uses the ways exactly as resolved above.
    if ( !is_trmm && !is_trsm ) return;

    const dim_t jc = bli_rntm_jc_ways( rntm );
    const dim_t pc = bli_rntm_pc_ways( rntm );
    const dim_t ic = bli_rntm_ic_ways( rntm );
    const dim_t jr = bli_rntm_jr_ways( rntm );
    const dim_t ir = bli_rntm_ir_ways( rntm );

    const bool on_left = bli_is_left( side );

    // The redistributed ways; exactly one of the four cases below fills
    // them in.
    dim_t jc_new, pc_new, ic_new, jr_new, ir_new;

    if ( is_trmm )
    {
        if ( on_left )
        {
            // Left-side trmm keeps the ways as they are.
            jc_new = jc;
            pc_new = pc;
            ic_new = ic;
            jr_new = jr;
            ir_new = ir;
        }
        else
        {
            // trmm_r has a dependency in the jc loop, so the jc ways are
            // folded into the jr loop. (NOTE: This dependency does not
            // exist for trmm3.)
            jc_new = 1;
            pc_new = pc;
            ic_new = ic;
            jr_new = jr * jc;
            ir_new = ir;
        }
    }
    else // if ( is_trsm )
    {
        if ( on_left )
        {
            // Fold pc into ic and ir into jr; jc is kept.
            jc_new = jc;
            pc_new = 1;
            ic_new = ic * pc;
            jr_new = jr * ir;
            ir_new = 1;
        }
        else
        {
            // Move all of the parallelism into the ic loop.
            jc_new = 1;
            pc_new = 1;
            ic_new = ic * pc * jc * ir * jr;
            jr_new = 1;
            ir_new = 1;
        }
    }

    bli_rntm_set_ways_only( jc_new, pc_new, ic_new, jr_new, ir_new, rntm );
}
// Normalize the threading fields of a rntm_t into a consistent state.
//
//   m, n  problem dimensions used when a thread factorization has to be
//         generated automatically.
//   k     not used by this function.
//   rntm  runtime object that is read and then updated in place
//         (auto_factor, num_threads, and the five ways).
//
// With multithreading enabled, the outcome depends on which fields were
// positive on entry:
//   - any way positive:     non-positive ways become 1 and num_threads
//                           becomes the product of the ways;
//   - only num_threads > 0: the ways are derived from num_threads and the
//                           problem size; num_threads is left unchanged;
//   - nothing positive:     num_threads and all ways become 1.
// auto_factor is set to TRUE whenever num_threads was positive on entry.
// With multithreading disabled, num_threads and all ways become 1 and
// auto_factor is FALSE.
void bli_rntm_set_ways_from_rntm
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    dim_t nt = bli_rntm_num_threads( rntm );
    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool  auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    // A rntm_t copied from the global runtime via
    // bli_rntm_init_from_global() is in one of three states:
    //   - num_threads is -1 and all of the ways are -1;
    //   - num_threads is -1 and all of the ways are set;
    //   - num_threads is set and all of the ways are -1.
    // A user-provided rntm_t may not be that clean, so the rules below
    // are applied to bring it into a predictable state.

    // Was the total number of threads specified?
    const bool nt_given = ( nt > 0 );

    // Was at least one of the per-loop ways specified?
    const bool ways_given = ( jc > 0 || pc > 0 || ic > 0 ||
                              jr > 0 || ir > 0 );

    // Record whether the number of threads was specified.
    auto_factor = nt_given;

    if ( ways_given )
    {
        // Explicit ways take precedence. The positive values are assumed
        // to be what the user wanted; the remaining ones default to 1.
        jc = ( jc < 1 ? 1 : jc );
        pc = ( pc < 1 ? 1 : pc );
        ic = ( ic < 1 ? 1 : ic );
        jr = ( jr < 1 ? 1 : jr );
        ir = ( ir < 1 ? 1 : ir );

        // The number of threads follows from the ways.
        nt = jc * pc * ic * jr * ir;
    }
    else if ( nt_given )
    {
        // Only the number of threads is known: attempt to generate a
        // thread factorization that suits the problem size. Only the ways
        // are written here; the number of threads stays as given.
        pc = 1;

        bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
                                      n*BLIS_THREAD_RATIO_N, &ic, &jc );

        // Move the largest factor of ic in (1, BLIS_THREAD_MAX_IR] into
        // ir; ir ends up as 1 when no such factor exists.
        ir = BLIS_THREAD_MAX_IR;
        while ( ir > 1 && ic % ir != 0 ) ir--;
        if ( ir > 1 ) ic /= ir;

        // Likewise for jc and jr, bounded by BLIS_THREAD_MAX_JR.
        jr = BLIS_THREAD_MAX_JR;
        while ( jr > 1 && jc % jr != 0 ) jr--;
        if ( jr > 1 ) jc /= jr;
    }
    else
    {
        // Neither the ways nor the number of threads were specified, so
        // the rntm_t was not meaningfully changed since initialization.
        // Default to single-threaded execution.
        nt = 1;
        jc = pc = ic = jr = ir = 1;
    }

#else

    // Without multithreading support, everything is forced to 1.
    nt = 1;
    jc = pc = ic = jr = ir = 1;

#endif

    // Write the normalized values back to the runtime object.
    bli_rntm_set_auto_factor_only( auto_factor, rntm );
    bli_rntm_set_num_threads_only( nt, rntm );
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
// Variant of bli_rntm_set_ways_from_rntm() with the "_sup" suffix:
// normalize the threading fields of a rntm_t into a consistent state.
//
//   m, n  problem dimensions used when a thread factorization has to be
//         generated automatically (passed unscaled to the partitioner).
//   k     not used by this function.
//   rntm  runtime object that is read and then updated in place
//         (auto_factor, num_threads, and the five ways).
//
// With multithreading enabled, the outcome depends on which fields were
// positive on entry:
//   - any way positive:     non-positive ways become 1 and num_threads
//                           becomes the product of the ways;
//   - only num_threads > 0: ic and jc are derived from num_threads and the
//                           problem size while pc, jr, and ir become 1;
//                           num_threads is left unchanged;
//   - nothing positive:     num_threads and all ways become 1.
// auto_factor is set to TRUE whenever num_threads was positive on entry.
// With multithreading disabled, num_threads and all ways become 1 and
// auto_factor is FALSE.
void bli_rntm_set_ways_from_rntm_sup
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    dim_t nt = bli_rntm_num_threads( rntm );
    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool  auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    // A rntm_t copied from the global runtime via
    // bli_rntm_init_from_global() is in one of three states:
    //   - num_threads is -1 and all of the ways are -1;
    //   - num_threads is -1 and all of the ways are set;
    //   - num_threads is set and all of the ways are -1.
    // A user-provided rntm_t may not be that clean, so the rules below
    // are applied to bring it into a predictable state.

    // Was the total number of threads specified?
    const bool nt_given = ( nt > 0 );

    // Was at least one of the per-loop ways specified?
    const bool ways_given = ( jc > 0 || pc > 0 || ic > 0 ||
                              jr > 0 || ir > 0 );

    // Record whether the number of threads was specified.
    auto_factor = nt_given;

    if ( ways_given )
    {
        // Explicit ways take precedence. The positive values are assumed
        // to be what the user wanted; the remaining ones default to 1.
        jc = ( jc < 1 ? 1 : jc );
        pc = ( pc < 1 ? 1 : pc );
        ic = ( ic < 1 ? 1 : ic );
        jr = ( jr < 1 ? 1 : jr );
        ir = ( ir < 1 ? 1 : ir );

        // The number of threads follows from the ways.
        nt = jc * pc * ic * jr * ir;
    }
    else if ( nt_given )
    {
        // Only the number of threads is known: attempt to generate a
        // thread factorization that suits the problem size. Only the ways
        // are written here; the number of threads stays as given.
        pc = 1;

        // Disabled alternative that scales the dimensions first:
        //bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M,
        //                              n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc );
        bli_thread_partition_2x2( nt, m, n, &ic, &jc );

        //printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", (int)jc, (int)ic );

#if 0
        // Disabled: split factors of ic and jc off into ir and jr.
        for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- )
        {
            if ( ic % ir == 0 ) { ic /= ir; break; }
        }
        for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- )
        {
            if ( jc % jr == 0 ) { jc /= jr; break; }
        }
#else
        // No parallelism is assigned to the ir and jr loops.
        ir = 1;
        jr = 1;
#endif
    }
    else
    {
        // Neither the ways nor the number of threads were specified, so
        // the rntm_t was not meaningfully changed since initialization.
        // Default to single-threaded execution.
        nt = 1;
        jc = pc = ic = jr = ir = 1;
    }

#else

    // Without multithreading support, everything is forced to 1.
    nt = 1;
    jc = pc = ic = jr = ir = 1;

#endif

    // Write the normalized values back to the runtime object.
    bli_rntm_set_auto_factor_only( auto_factor, rntm );
    bli_rntm_set_num_threads_only( nt, rntm );
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
// Print the threading fields of a rntm_t to standard output as a two-line
// table: a header row naming the columns, followed by a row holding the
// auto_factor flag, the number of threads, and the jc/pc/ic/jr/ir ways.
//
//   rntm  runtime object to print; it is only read.
void bli_rntm_print
     (
       rntm_t* rntm
     )
{
    dim_t af = bli_rntm_auto_factor( rntm );
    dim_t nt = bli_rntm_num_threads( rntm );
    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    // The label "rntm contents" is 13 characters wide, the same as the
    // "autofac? N | " prefix of the value row. Each column name is printed
    // right-justified in a 4-character field so that it sits directly
    // above the corresponding %4d value.
    printf( "rntm contents%4s%4s%4s%4s%4s%4s\n",
            "nt", "jc", "pc", "ic", "jr", "ir" );

    // The values are cast to int to match the %d conversions.
    printf( "autofac? %1d | %4d%4d%4d%4d%4d%4d\n", (int)af,
            (int)nt, (int)jc, (int)pc,
            (int)ic, (int)jr, (int)ir );
}
// -----------------------------------------------------------------------------
// Compute the product of the ways of parallelism associated with a run of
// blocksize ids.
//
//   bszid_cur  pointer to the current element of a bszid_t array. The
//              array must contain a BLIS_KR entry at or after this
//              element, since that entry terminates the scan.
//   rntm       runtime object from which the ways are read.
//
// Returns the product of bli_rntm_ways_for() over every element from
// *bszid_cur up to, but not including, the first BLIS_KR entry. Elements
// equal to BLIS_NO_PART are skipped. The result is 1 when *bszid_cur is
// already BLIS_KR.
//
// For reference, the array for the bp algorithm is laid out as:
//   BLIS_NC       level 0: 5th loop
//   BLIS_KC       level 1: 4th loop
//   BLIS_NO_PART  level 2: pack B
//   BLIS_MC       level 3: 3rd loop
//   BLIS_NO_PART  level 4: pack A
//   BLIS_NR       level 5: 2nd loop
//   BLIS_MR       level 6: 1st loop
//   BLIS_KR       level 7: ukr loop
// (the pb algorithm array likewise ends in BLIS_NR, BLIS_MR, BLIS_KR).
dim_t bli_rntm_calc_num_threads_in
     (
       bszid_t* restrict bszid_cur,
       rntm_t*  restrict rntm
     )
{
    dim_t product = 1;

    for ( dim_t i = 0; bszid_cur[ i ] != BLIS_KR; ++i )
    {
        const bszid_t id = bszid_cur[ i ];

        // Entries that do not correspond to a partitioned loop contribute
        // nothing. Any other id is assumed to be one of
        // {NC,KC,MC,NR,MR,KR}.
        if ( id == BLIS_NO_PART ) continue;

        product *= bli_rntm_ways_for( id, rntm );
    }

    return product;
}
// NOTE: The fragment below is compiled out. It is an alternative form of
// the loop in bli_rntm_calc_num_threads_in() that multiplies by 1 for
// BLIS_NO_PART entries instead of skipping them.
#if 0
for ( ; *bszid_cur != BLIS_KR; bszid_cur++ )
{
const bszid_t bszid = *bszid_cur;
dim_t cur_way = 1;
// We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not
// BLIS_NO_PART.
if ( bszid != BLIS_NO_PART )
cur_way = bli_rntm_ways_for( bszid, rntm );
else
cur_way = 1;
n_threads_in *= cur_way;
}
#endif