Fixed thrinfo_t printing bug for small problems.

Details:
- Fixed a bug in bli_l3_thrinfo_print_gemm_paths() and
  bli_l3_thrinfo_print_trsm_paths(), defined in bli_l3_thrinfo.c,
  whereby subnodes of the thrinfo_t tree are "dereferenced" near the
  beginning of the functions, which may lead to segfaults in certain
  situations where the thread tree was not fully formed because the
  matrix problem was too small for the level of parallelism specified.
  (That is, too small because some threads were assigned no work due
  to the smallest units in the m and n dimensions being defined by the
  register blocksizes mr and nr.) The fix requires several nested levels
  of if statements, and this is one of those few instances where use of
  goto statements results in (mostly) prettier code, especially in the
  case of _gemm_paths(). And while it wasn't necessary, I ported this
  goto usage to the loop body that prints the thrinfo_t work_id and
  comm_id values for each thread. Thanks to Nicholai Tukanov for helping
  to find this bug.
This commit is contained in:
Field G. Van Zee
2019-06-24 17:47:40 -05:00
parent c152109e9a
commit ceee2f973e

View File

@@ -99,35 +99,84 @@ void bli_l3_thrinfo_print_gemm_paths
thrinfo_t** threads
)
{
// In order to query the number of threads, we query the only thread we
// know exists: thread 0.
dim_t n_threads = bli_thread_num_threads( threads[0] );
dim_t gl_id;
thrinfo_t* jc_info = threads[0];
thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );
// For the purposes of printing the "header" information that is common
// to the various instances of a thrinfo_t (ie: across all threads), we
// choose the last thread in case the problem is so small that there is
// only an "edge" case, which will always be assigned to the last thread
// (at least for higher levels of partitioning).
thrinfo_t* jc_info = threads[n_threads-1];
thrinfo_t* pc_info = NULL;
thrinfo_t* pb_info = NULL;
thrinfo_t* ic_info = NULL;
thrinfo_t* pa_info = NULL;
thrinfo_t* jr_info = NULL;
thrinfo_t* ir_info = NULL;
dim_t jc_way = bli_thread_n_way( jc_info );
dim_t pc_way = bli_thread_n_way( pc_info );
dim_t pb_way = bli_thread_n_way( pb_info );
dim_t ic_way = bli_thread_n_way( ic_info );
dim_t pa_way = bli_thread_n_way( pa_info );
dim_t jr_way = bli_thread_n_way( jr_info );
dim_t ir_way = bli_thread_n_way( ir_info );
// Initialize the n_ways and n_threads fields of each thrinfo_t "level"
// to -1. More than likely, these will all be overwritten with meaningful
// values, but in case some thrinfo_t trees are not fully built (see
// next comment), these will be the placeholder values.
dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1,
pa_way = -1, jr_way = -1, ir_way = -1;
dim_t jc_nt = bli_thread_num_threads( jc_info );
dim_t pc_nt = bli_thread_num_threads( pc_info );
dim_t pb_nt = bli_thread_num_threads( pb_info );
dim_t ic_nt = bli_thread_num_threads( ic_info );
dim_t pa_nt = bli_thread_num_threads( pa_info );
dim_t jr_nt = bli_thread_num_threads( jr_info );
dim_t ir_nt = bli_thread_num_threads( ir_info );
dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1,
pa_nt = -1, jr_nt = -1, ir_nt = -1;
// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
// may not fully build their thrinfo_t structures--specifically when the
// dimension being parallelized is not large enough for each thread to have
// even one unit of work (where a unit is usually a single micropanel's
// width, MR or NR).
if ( !jc_info ) goto print_header;
jc_way = bli_thread_n_way( jc_info );
jc_nt = bli_thread_num_threads( jc_info );
pc_info = bli_thrinfo_sub_node( jc_info );
if ( !pc_info ) goto print_header;
pc_way = bli_thread_n_way( pc_info );
pc_nt = bli_thread_num_threads( pc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
if ( !pb_info ) goto print_header;
pb_way = bli_thread_n_way( pb_info );
pb_nt = bli_thread_num_threads( pb_info );
ic_info = bli_thrinfo_sub_node( pb_info );
if ( !ic_info ) goto print_header;
ic_way = bli_thread_n_way( ic_info );
ic_nt = bli_thread_num_threads( ic_info );
pa_info = bli_thrinfo_sub_node( ic_info );
if ( !pa_info ) goto print_header;
pa_way = bli_thread_n_way( pa_info );
pa_nt = bli_thread_num_threads( pa_info );
jr_info = bli_thrinfo_sub_node( pa_info );
if ( !jr_info ) goto print_header;
jr_way = bli_thread_n_way( jr_info );
jr_nt = bli_thread_num_threads( jr_info );
ir_info = bli_thrinfo_sub_node( jr_info );
if ( !ir_info ) goto print_header;
ir_way = bli_thread_n_way( ir_info );
ir_nt = bli_thread_num_threads( ir_info );
print_header:
printf( " jc kc pb ic pa jr ir\n" );
printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
printf( "xx_nt: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
( unsigned long )jc_nt,
( unsigned long )pc_nt,
( unsigned long )pb_nt,
@@ -135,7 +184,7 @@ void bli_l3_thrinfo_print_gemm_paths
( unsigned long )pa_nt,
( unsigned long )jr_nt,
( unsigned long )ir_nt );
printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
( unsigned long )jc_way,
( unsigned long )pc_way,
( unsigned long )pb_way,
@@ -145,116 +194,59 @@ void bli_l3_thrinfo_print_gemm_paths
( unsigned long )ir_way );
printf( "============================================\n" );
dim_t jc_comm_id;
dim_t pc_comm_id;
dim_t pb_comm_id;
dim_t ic_comm_id;
dim_t pa_comm_id;
dim_t jr_comm_id;
dim_t ir_comm_id;
dim_t jc_work_id;
dim_t pc_work_id;
dim_t pb_work_id;
dim_t ic_work_id;
dim_t pa_work_id;
dim_t jr_work_id;
dim_t ir_work_id;
for ( gl_id = 0; gl_id < n_threads; ++gl_id )
for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
{
jc_info = threads[gl_id];
// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
// may not fully build their thrinfo_t structures--specifically when the
// dimension being parallelized is not large enough for each thread to have
// even one unit of work (where a unit is usually a single micropanel's
// width, MR or NR).
if ( !jc_info )
{
jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
}
else
{
jc_comm_id = bli_thread_ocomm_id( jc_info );
jc_work_id = bli_thread_work_id( jc_info );
pc_info = bli_thrinfo_sub_node( jc_info );
dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1,
pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1;
if ( !pc_info )
{
pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
}
else
{
pc_comm_id = bli_thread_ocomm_id( pc_info );
pc_work_id = bli_thread_work_id( pc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1,
pa_work_id = -1, jr_work_id = -1, ir_work_id = -1;
if ( !pb_info )
{
pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
}
else
{
pb_comm_id = bli_thread_ocomm_id( pb_info );
pb_work_id = bli_thread_work_id( pb_info );
ic_info = bli_thrinfo_sub_node( pb_info );
if ( !jc_info ) goto print_thrinfo;
if ( !ic_info )
{
ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
}
else
{
ic_comm_id = bli_thread_ocomm_id( ic_info );
ic_work_id = bli_thread_work_id( ic_info );
pa_info = bli_thrinfo_sub_node( ic_info );
jc_comm_id = bli_thread_ocomm_id( jc_info );
jc_work_id = bli_thread_work_id( jc_info );
pc_info = bli_thrinfo_sub_node( jc_info );
if ( !pa_info )
{
pa_comm_id = jr_comm_id = ir_comm_id = -1;
pa_work_id = jr_work_id = ir_work_id = -1;
}
else
{
pa_comm_id = bli_thread_ocomm_id( pa_info );
pa_work_id = bli_thread_work_id( pa_info );
jr_info = bli_thrinfo_sub_node( pa_info );
if ( !pc_info ) goto print_thrinfo;
if ( !jr_info )
{
jr_comm_id = ir_comm_id = -1;
jr_work_id = ir_work_id = -1;
}
else
{
jr_comm_id = bli_thread_ocomm_id( jr_info );
jr_work_id = bli_thread_work_id( jr_info );
ir_info = bli_thrinfo_sub_node( jr_info );
pc_comm_id = bli_thread_ocomm_id( pc_info );
pc_work_id = bli_thread_work_id( pc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
if ( !ir_info )
{
ir_comm_id = -1;
ir_work_id = -1;
}
else
{
ir_comm_id = bli_thread_ocomm_id( ir_info );
ir_work_id = bli_thread_work_id( ir_info );
}
}
}
}
}
}
}
if ( !pb_info ) goto print_thrinfo;
pb_comm_id = bli_thread_ocomm_id( pb_info );
pb_work_id = bli_thread_work_id( pb_info );
ic_info = bli_thrinfo_sub_node( pb_info );
if ( !ic_info ) goto print_thrinfo;
ic_comm_id = bli_thread_ocomm_id( ic_info );
ic_work_id = bli_thread_work_id( ic_info );
pa_info = bli_thrinfo_sub_node( ic_info );
if ( !pa_info ) goto print_thrinfo;
pa_comm_id = bli_thread_ocomm_id( pa_info );
pa_work_id = bli_thread_work_id( pa_info );
jr_info = bli_thrinfo_sub_node( pa_info );
if ( !jr_info ) goto print_thrinfo;
jr_comm_id = bli_thread_ocomm_id( jr_info );
jr_work_id = bli_thread_work_id( jr_info );
ir_info = bli_thrinfo_sub_node( jr_info );
if ( !ir_info ) goto print_thrinfo;
ir_comm_id = bli_thread_ocomm_id( ir_info );
ir_work_id = bli_thread_work_id( ir_info );
print_thrinfo:
//printf( " gl jc pb kc pa ic jr \n" );
//printf( " gl jc kc pb ic pa jr \n" );
printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
( long )jc_comm_id,
( long )pc_comm_id,
@@ -285,44 +277,105 @@ void bli_l3_thrinfo_print_trsm_paths
thrinfo_t** threads
)
{
// In order to query the number of threads, we query the only thread we
// know exists: thread 0.
dim_t n_threads = bli_thread_num_threads( threads[0] );
dim_t gl_id;
thrinfo_t* jc_info = threads[0];
thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
// For the purposes of printing the "header" information that is common
// to the various instances of a thrinfo_t (ie: across all threads), we
// choose the last thread in case the problem is so small that there is
// only an "edge" case, which will always be assigned to the last thread
// (at least for higher levels of partitioning).
thrinfo_t* jc_info = threads[n_threads-1];
thrinfo_t* pc_info = NULL;
thrinfo_t* pb_info = NULL;
thrinfo_t* ic_info = NULL;
thrinfo_t* pa_info = NULL; thrinfo_t* pa_info0 = NULL;
thrinfo_t* jr_info = NULL; thrinfo_t* jr_info0 = NULL;
thrinfo_t* ir_info = NULL; thrinfo_t* ir_info0 = NULL;
thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );
thrinfo_t* pa_info0 = bli_thrinfo_sub_prenode( ic_info );
thrinfo_t* jr_info0 = ( pa_info0 ? bli_thrinfo_sub_node( pa_info0 ) : NULL );
thrinfo_t* ir_info0 = ( jr_info0 ? bli_thrinfo_sub_node( jr_info0 ) : NULL );
// Initialize the n_ways and n_threads fields of each thrinfo_t "level"
// to -1. More than likely, these will all be overwritten with meaningful
// values, but in case some thrinfo_t trees are not fully built (see
// next comment), these will be the placeholder values.
dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1,
pa_way = -1, jr_way = -1, ir_way = -1,
pa_way0 = -1, jr_way0 = -1, ir_way0 = -1;
dim_t jc_way = bli_thread_n_way( jc_info );
dim_t pc_way = bli_thread_n_way( pc_info );
dim_t pb_way = bli_thread_n_way( pb_info );
dim_t ic_way = bli_thread_n_way( ic_info );
dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1,
pa_nt = -1, jr_nt = -1, ir_nt = -1,
pa_nt0 = -1, jr_nt0 = -1, ir_nt0 = -1;
dim_t pa_way = bli_thread_n_way( pa_info );
dim_t jr_way = bli_thread_n_way( jr_info );
dim_t ir_way = bli_thread_n_way( ir_info );
dim_t pa_way0 = ( pa_info0 ? bli_thread_n_way( pa_info0 ) : -1 );
dim_t jr_way0 = ( jr_info0 ? bli_thread_n_way( jr_info0 ) : -1 );
dim_t ir_way0 = ( ir_info0 ? bli_thread_n_way( ir_info0 ) : -1 );
// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
// may not fully build their thrinfo_t structures--specifically when the
// dimension being parallelized is not large enough for each thread to have
// even one unit of work (where a unit is usually a single micropanel's
// width, MR or NR).
dim_t jc_nt = bli_thread_num_threads( jc_info );
dim_t pc_nt = bli_thread_num_threads( pc_info );
dim_t pb_nt = bli_thread_num_threads( pb_info );
dim_t ic_nt = bli_thread_num_threads( ic_info );
if ( !jc_info ) goto print_header;
dim_t pa_nt = bli_thread_num_threads( pa_info );
dim_t jr_nt = bli_thread_num_threads( jr_info );
dim_t ir_nt = bli_thread_num_threads( ir_info );
dim_t pa_nt0 = ( pa_info0 ? bli_thread_num_threads( pa_info0 ) : -1 );
dim_t jr_nt0 = ( jr_info0 ? bli_thread_num_threads( jr_info0 ) : -1 );
dim_t ir_nt0 = ( ir_info0 ? bli_thread_num_threads( ir_info0 ) : -1 );
jc_way = bli_thread_n_way( jc_info );
jc_nt = bli_thread_num_threads( jc_info );
pc_info = bli_thrinfo_sub_node( jc_info );
if ( !pc_info ) goto print_header;
pc_way = bli_thread_n_way( pc_info );
pc_nt = bli_thread_num_threads( pc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
if ( !pb_info ) goto print_header;
pb_way = bli_thread_n_way( pb_info );
pb_nt = bli_thread_num_threads( pb_info );
ic_info = bli_thrinfo_sub_node( pb_info );
if ( !ic_info ) goto print_header;
ic_way = bli_thread_n_way( ic_info );
ic_nt = bli_thread_num_threads( ic_info );
pa_info = bli_thrinfo_sub_node( ic_info );
pa_info0 = bli_thrinfo_sub_prenode( ic_info );
// check_header_prenode:
if ( !pa_info0 ) goto check_header_node;
pa_way0 = bli_thread_n_way( pa_info0 );
pa_nt0 = bli_thread_num_threads( pa_info0 );
jr_info0 = bli_thrinfo_sub_node( pa_info0 );
if ( !jr_info0 ) goto check_header_node;
jr_way0 = bli_thread_n_way( jr_info0 );
jr_nt0 = bli_thread_num_threads( jr_info0 );
ir_info0 = bli_thrinfo_sub_node( jr_info0 );
if ( !ir_info0 ) goto check_header_node;
ir_way0 = bli_thread_n_way( ir_info0 );
ir_nt0 = bli_thread_num_threads( ir_info0 );
check_header_node:
if ( !pa_info ) goto print_header;
pa_way = bli_thread_n_way( pa_info );
pa_nt = bli_thread_num_threads( pa_info );
jr_info = bli_thrinfo_sub_node( pa_info );
if ( !jr_info ) goto print_header;
jr_way = bli_thread_n_way( jr_info );
jr_nt = bli_thread_num_threads( jr_info );
ir_info = bli_thrinfo_sub_node( jr_info );
if ( !ir_info ) goto print_header;
ir_way = bli_thread_n_way( ir_info );
ir_nt = bli_thread_num_threads( ir_info );
print_header:
printf( " jc kc pb ic pa jr ir\n" );
printf( "xx_nt: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
@@ -343,26 +396,105 @@ void bli_l3_thrinfo_print_trsm_paths
( long )ir_way0, ( long )ir_way );
printf( "==================================================\n" );
dim_t jc_comm_id;
dim_t pc_comm_id;
dim_t pb_comm_id;
dim_t ic_comm_id;
dim_t pa_comm_id0, pa_comm_id;
dim_t jr_comm_id0, jr_comm_id;
dim_t ir_comm_id0, ir_comm_id;
dim_t jc_work_id;
dim_t pc_work_id;
dim_t pb_work_id;
dim_t ic_work_id;
dim_t pa_work_id0, pa_work_id;
dim_t jr_work_id0, jr_work_id;
dim_t ir_work_id0, ir_work_id;
for ( gl_id = 0; gl_id < n_threads; ++gl_id )
for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
{
jc_info = threads[gl_id];
#if 1
// NOTE: This cpp branch contains code that is safe to execute
// for small problems that are parallelized enough that one or
// more threads gets no work.
dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1,
pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1,
pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1;
dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1,
pa_work_id = -1, jr_work_id = -1, ir_work_id = -1,
pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1;
if ( !jc_info ) goto print_thrinfo;
jc_comm_id = bli_thread_ocomm_id( jc_info );
jc_work_id = bli_thread_work_id( jc_info );
pc_info = bli_thrinfo_sub_node( jc_info );
if ( !pc_info ) goto print_thrinfo;
pc_comm_id = bli_thread_ocomm_id( pc_info );
pc_work_id = bli_thread_work_id( pc_info );
pb_info = bli_thrinfo_sub_node( pc_info );
if ( !pb_info ) goto print_thrinfo;
pb_comm_id = bli_thread_ocomm_id( pb_info );
pb_work_id = bli_thread_work_id( pb_info );
ic_info = bli_thrinfo_sub_node( pb_info );
if ( !ic_info ) goto print_thrinfo;
ic_comm_id = bli_thread_ocomm_id( ic_info );
ic_work_id = bli_thread_work_id( ic_info );
pa_info = bli_thrinfo_sub_node( ic_info );
pa_info0 = bli_thrinfo_sub_prenode( ic_info );
// check_thrinfo_prenode:
if ( !pa_info0 ) goto check_thrinfo_node;
pa_comm_id0 = bli_thread_ocomm_id( pa_info0 );
pa_work_id0 = bli_thread_work_id( pa_info0 );
jr_info0 = bli_thrinfo_sub_node( pa_info0 );
if ( !jr_info0 ) goto check_thrinfo_node;
jr_comm_id0 = bli_thread_ocomm_id( jr_info0 );
jr_work_id0 = bli_thread_work_id( jr_info0 );
ir_info0 = bli_thrinfo_sub_node( jr_info0 );
if ( !ir_info0 ) goto check_thrinfo_node;
ir_comm_id0 = bli_thread_ocomm_id( ir_info0 );
ir_work_id0 = bli_thread_work_id( ir_info0 );
check_thrinfo_node:
if ( !pa_info ) goto print_thrinfo;
pa_comm_id = bli_thread_ocomm_id( pa_info );
pa_work_id = bli_thread_work_id( pa_info );
jr_info = bli_thrinfo_sub_node( pa_info );
if ( !jr_info ) goto print_thrinfo;
jr_comm_id = bli_thread_ocomm_id( jr_info );
jr_work_id = bli_thread_work_id( jr_info );
ir_info = bli_thrinfo_sub_node( jr_info );
if ( !ir_info ) goto print_thrinfo;
ir_comm_id = bli_thread_ocomm_id( ir_info );
ir_work_id = bli_thread_work_id( ir_info );
print_thrinfo:
#else
dim_t jc_comm_id;
dim_t pc_comm_id;
dim_t pb_comm_id;
dim_t ic_comm_id;
dim_t pa_comm_id0, pa_comm_id;
dim_t jr_comm_id0, jr_comm_id;
dim_t ir_comm_id0, ir_comm_id;
dim_t jc_work_id;
dim_t pc_work_id;
dim_t pb_work_id;
dim_t ic_work_id;
dim_t pa_work_id0, pa_work_id;
dim_t jr_work_id0, jr_work_id;
dim_t ir_work_id0, ir_work_id;
// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
// may not fully build their thrinfo_t structures--specifically when the
// dimension being parallelized is not large enough for each thread to have
@@ -488,6 +620,7 @@ void bli_l3_thrinfo_print_trsm_paths
}
}
}
#endif
printf( "comm ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
( long )jc_comm_id,