diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 1d876d50f..4f073cb20 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -99,35 +99,84 @@ void bli_l3_thrinfo_print_gemm_paths thrinfo_t** threads ) { + // In order to query the number of threads, we query the only thread we + // know exists: thread 0. dim_t n_threads = bli_thread_num_threads( threads[0] ); - dim_t gl_id; - thrinfo_t* jc_info = threads[0]; - thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); - thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); - thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); - thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); - thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); - thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); + // For the purposes of printing the "header" information that is common + // to the various instances of a thrinfo_t (ie: across all threads), we + // choose the last thread in case the problem is so small that there is + // only an "edge" case, which will always be assigned to the last thread + // (at least for higher levels of partitioning). + thrinfo_t* jc_info = threads[n_threads-1]; + thrinfo_t* pc_info = NULL; + thrinfo_t* pb_info = NULL; + thrinfo_t* ic_info = NULL; + thrinfo_t* pa_info = NULL; + thrinfo_t* jr_info = NULL; + thrinfo_t* ir_info = NULL; - dim_t jc_way = bli_thread_n_way( jc_info ); - dim_t pc_way = bli_thread_n_way( pc_info ); - dim_t pb_way = bli_thread_n_way( pb_info ); - dim_t ic_way = bli_thread_n_way( ic_info ); - dim_t pa_way = bli_thread_n_way( pa_info ); - dim_t jr_way = bli_thread_n_way( jr_info ); - dim_t ir_way = bli_thread_n_way( ir_info ); + // Initialize the n_ways and n_threads fields of each thrinfo_t "level" + // to -1. More than likely, these will all be overwritten with meaningful + // values, but in case some thrinfo_t trees are not fully built (see + // next comment), these will be the placeholder values. 
+ dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, + pa_way = -1, jr_way = -1, ir_way = -1; - dim_t jc_nt = bli_thread_num_threads( jc_info ); - dim_t pc_nt = bli_thread_num_threads( pc_info ); - dim_t pb_nt = bli_thread_num_threads( pb_info ); - dim_t ic_nt = bli_thread_num_threads( ic_info ); - dim_t pa_nt = bli_thread_num_threads( pa_info ); - dim_t jr_nt = bli_thread_num_threads( jr_info ); - dim_t ir_nt = bli_thread_num_threads( ir_info ); + dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, + pa_nt = -1, jr_nt = -1, ir_nt = -1; + + // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads + // may not fully build their thrinfo_t structures--specifically when the + // dimension being parallelized is not large enough for each thread to have + // even one unit of work (where a unit is usually a single micropanel's + // width, MR or NR). + + if ( !jc_info ) goto print_header; + + jc_way = bli_thread_n_way( jc_info ); + jc_nt = bli_thread_num_threads( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_header; + + pc_way = bli_thread_n_way( pc_info ); + pc_nt = bli_thread_num_threads( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_header; + + pb_way = bli_thread_n_way( pb_info ); + pb_nt = bli_thread_num_threads( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_header; + + ic_way = bli_thread_n_way( ic_info ); + ic_nt = bli_thread_num_threads( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + + if ( !pa_info ) goto print_header; + + pa_way = bli_thread_n_way( pa_info ); + pa_nt = bli_thread_num_threads( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_header; + + jr_way = bli_thread_n_way( jr_info ); + jr_nt = bli_thread_num_threads( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_header; + + ir_way = bli_thread_n_way( ir_info ); + ir_nt = 
bli_thread_num_threads( ir_info ); + + print_header: printf( " jc kc pb ic pa jr ir\n" ); - printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + printf( "xx_nt: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_nt, ( unsigned long )pc_nt, ( unsigned long )pb_nt, @@ -135,7 +184,7 @@ void bli_l3_thrinfo_print_gemm_paths ( unsigned long )pa_nt, ( unsigned long )jr_nt, ( unsigned long )ir_nt ); - printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_way, ( unsigned long )pc_way, ( unsigned long )pb_way, @@ -145,116 +194,59 @@ void bli_l3_thrinfo_print_gemm_paths ( unsigned long )ir_way ); printf( "============================================\n" ); - dim_t jc_comm_id; - dim_t pc_comm_id; - dim_t pb_comm_id; - dim_t ic_comm_id; - dim_t pa_comm_id; - dim_t jr_comm_id; - dim_t ir_comm_id; - - dim_t jc_work_id; - dim_t pc_work_id; - dim_t pb_work_id; - dim_t ic_work_id; - dim_t pa_work_id; - dim_t jr_work_id; - dim_t ir_work_id; - - for ( gl_id = 0; gl_id < n_threads; ++gl_id ) + for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; - // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads - // may not fully build their thrinfo_t structures--specifically when the - // dimension being parallelized is not large enough for each thread to have - // even one unit of work (where as unit is usually a single micropanel's - // width, MR or NR). 
- if ( !jc_info ) - { - jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - jc_comm_id = bli_thread_ocomm_id( jc_info ); - jc_work_id = bli_thread_work_id( jc_info ); - pc_info = bli_thrinfo_sub_node( jc_info ); + dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, + pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1; - if ( !pc_info ) - { - pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pc_comm_id = bli_thread_ocomm_id( pc_info ); - pc_work_id = bli_thread_work_id( pc_info ); - pb_info = bli_thrinfo_sub_node( pc_info ); + dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, + pa_work_id = -1, jr_work_id = -1, ir_work_id = -1; - if ( !pb_info ) - { - pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pb_comm_id = bli_thread_ocomm_id( pb_info ); - pb_work_id = bli_thread_work_id( pb_info ); - ic_info = bli_thrinfo_sub_node( pb_info ); + if ( !jc_info ) goto print_thrinfo; - if ( !ic_info ) - { - ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - ic_comm_id = bli_thread_ocomm_id( ic_info ); - ic_work_id = bli_thread_work_id( ic_info ); - pa_info = bli_thrinfo_sub_node( ic_info ); + jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_work_id = bli_thread_work_id( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); - if ( !pa_info ) - { - pa_comm_id = jr_comm_id = ir_comm_id = -1; - pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pa_comm_id = bli_thread_ocomm_id( pa_info ); - pa_work_id = bli_thread_work_id( pa_info ); - jr_info = 
bli_thrinfo_sub_node( pa_info ); + if ( !pc_info ) goto print_thrinfo; - if ( !jr_info ) - { - jr_comm_id = ir_comm_id = -1; - jr_work_id = ir_work_id = -1; - } - else - { - jr_comm_id = bli_thread_ocomm_id( jr_info ); - jr_work_id = bli_thread_work_id( jr_info ); - ir_info = bli_thrinfo_sub_node( jr_info ); + pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_work_id = bli_thread_work_id( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); - if ( !ir_info ) - { - ir_comm_id = -1; - ir_work_id = -1; - } - else - { - ir_comm_id = bli_thread_ocomm_id( ir_info ); - ir_work_id = bli_thread_work_id( ir_info ); - } - } - } - } - } - } - } + if ( !pb_info ) goto print_thrinfo; + + pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_work_id = bli_thread_work_id( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_thrinfo; + + ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_work_id = bli_thread_work_id( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + + if ( !pa_info ) goto print_thrinfo; + + pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_work_id = bli_thread_work_id( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_thrinfo; + + jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_work_id = bli_thread_work_id( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_thrinfo; + + ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_work_id = bli_thread_work_id( ir_info ); + + print_thrinfo: - //printf( " gl jc pb kc pa ic jr \n" ); - //printf( " gl jc kc pb ic pa jr \n" ); printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( long )jc_comm_id, ( long )pc_comm_id, @@ -285,44 +277,105 @@ void bli_l3_thrinfo_print_trsm_paths thrinfo_t** threads ) { + // In order to query the number of threads, we query the only thread we + // know exists: thread 0. 
dim_t n_threads = bli_thread_num_threads( threads[0] ); - dim_t gl_id; - thrinfo_t* jc_info = threads[0]; - thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); - thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); - thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); + // For the purposes of printing the "header" information that is common + // to the various instances of a thrinfo_t (ie: across all threads), we + // choose the last thread in case the problem is so small that there is + // only an "edge" case, which will always be assigned to the last thread + // (at least for higher levels of partitioning). + thrinfo_t* jc_info = threads[n_threads-1]; + thrinfo_t* pc_info = NULL; + thrinfo_t* pb_info = NULL; + thrinfo_t* ic_info = NULL; + thrinfo_t* pa_info = NULL; thrinfo_t* pa_info0 = NULL; + thrinfo_t* jr_info = NULL; thrinfo_t* jr_info0 = NULL; + thrinfo_t* ir_info = NULL; thrinfo_t* ir_info0 = NULL; - thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); - thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); - thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); - thrinfo_t* pa_info0 = bli_thrinfo_sub_prenode( ic_info ); - thrinfo_t* jr_info0 = ( pa_info0 ? bli_thrinfo_sub_node( pa_info0 ) : NULL ); - thrinfo_t* ir_info0 = ( jr_info0 ? bli_thrinfo_sub_node( jr_info0 ) : NULL ); + // Initialize the n_ways and n_threads fields of each thrinfo_t "level" + // to -1. More than likely, these will all be overwritten with meaningful + // values, but in case some thrinfo_t trees are not fully built (see + // next comment), these will be the placeholder values. 
+ dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, + pa_way = -1, jr_way = -1, ir_way = -1, + pa_way0 = -1, jr_way0 = -1, ir_way0 = -1; - dim_t jc_way = bli_thread_n_way( jc_info ); - dim_t pc_way = bli_thread_n_way( pc_info ); - dim_t pb_way = bli_thread_n_way( pb_info ); - dim_t ic_way = bli_thread_n_way( ic_info ); + dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, + pa_nt = -1, jr_nt = -1, ir_nt = -1, + pa_nt0 = -1, jr_nt0 = -1, ir_nt0 = -1; - dim_t pa_way = bli_thread_n_way( pa_info ); - dim_t jr_way = bli_thread_n_way( jr_info ); - dim_t ir_way = bli_thread_n_way( ir_info ); - dim_t pa_way0 = ( pa_info0 ? bli_thread_n_way( pa_info0 ) : -1 ); - dim_t jr_way0 = ( jr_info0 ? bli_thread_n_way( jr_info0 ) : -1 ); - dim_t ir_way0 = ( ir_info0 ? bli_thread_n_way( ir_info0 ) : -1 ); + // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads + // may not fully build their thrinfo_t structures--specifically when the + // dimension being parallelized is not large enough for each thread to have + // even one unit of work (where a unit is usually a single micropanel's + // width, MR or NR). - dim_t jc_nt = bli_thread_num_threads( jc_info ); - dim_t pc_nt = bli_thread_num_threads( pc_info ); - dim_t pb_nt = bli_thread_num_threads( pb_info ); - dim_t ic_nt = bli_thread_num_threads( ic_info ); + if ( !jc_info ) goto print_header; - dim_t pa_nt = bli_thread_num_threads( pa_info ); - dim_t jr_nt = bli_thread_num_threads( jr_info ); - dim_t ir_nt = bli_thread_num_threads( ir_info ); - dim_t pa_nt0 = ( pa_info0 ? bli_thread_num_threads( pa_info0 ) : -1 ); - dim_t jr_nt0 = ( jr_info0 ? bli_thread_num_threads( jr_info0 ) : -1 ); - dim_t ir_nt0 = ( ir_info0 ? 
bli_thread_num_threads( ir_info0 ) : -1 ); + jc_way = bli_thread_n_way( jc_info ); + jc_nt = bli_thread_num_threads( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_header; + + pc_way = bli_thread_n_way( pc_info ); + pc_nt = bli_thread_num_threads( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_header; + + pb_way = bli_thread_n_way( pb_info ); + pb_nt = bli_thread_num_threads( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_header; + + ic_way = bli_thread_n_way( ic_info ); + ic_nt = bli_thread_num_threads( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + pa_info0 = bli_thrinfo_sub_prenode( ic_info ); + + // check_header_prenode: + + if ( !pa_info0 ) goto check_header_node; + + pa_way0 = bli_thread_n_way( pa_info0 ); + pa_nt0 = bli_thread_num_threads( pa_info0 ); + jr_info0 = bli_thrinfo_sub_node( pa_info0 ); + + if ( !jr_info0 ) goto check_header_node; + + jr_way0 = bli_thread_n_way( jr_info0 ); + jr_nt0 = bli_thread_num_threads( jr_info0 ); + ir_info0 = bli_thrinfo_sub_node( jr_info0 ); + + if ( !ir_info0 ) goto check_header_node; + + ir_way0 = bli_thread_n_way( ir_info0 ); + ir_nt0 = bli_thread_num_threads( ir_info0 ); + + check_header_node: + + if ( !pa_info ) goto print_header; + + pa_way = bli_thread_n_way( pa_info ); + pa_nt = bli_thread_num_threads( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_header; + + jr_way = bli_thread_n_way( jr_info ); + jr_nt = bli_thread_num_threads( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_header; + + ir_way = bli_thread_n_way( ir_info ); + ir_nt = bli_thread_num_threads( ir_info ); + + print_header: printf( " jc kc pb ic pa jr ir\n" ); printf( "xx_nt: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", @@ -343,26 +396,105 @@ void bli_l3_thrinfo_print_trsm_paths ( long )ir_way0, ( long )ir_way ); printf( 
"==================================================\n" ); - dim_t jc_comm_id; - dim_t pc_comm_id; - dim_t pb_comm_id; - dim_t ic_comm_id; - dim_t pa_comm_id0, pa_comm_id; - dim_t jr_comm_id0, jr_comm_id; - dim_t ir_comm_id0, ir_comm_id; - dim_t jc_work_id; - dim_t pc_work_id; - dim_t pb_work_id; - dim_t ic_work_id; - dim_t pa_work_id0, pa_work_id; - dim_t jr_work_id0, jr_work_id; - dim_t ir_work_id0, ir_work_id; - - for ( gl_id = 0; gl_id < n_threads; ++gl_id ) + for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; +#if 1 + // NOTE: This cpp branch contains code that is safe to execute + // for small problems that are parallelized enough that one or + // more threads gets no work. + + dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, + pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1, + pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1; + + dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, + pa_work_id = -1, jr_work_id = -1, ir_work_id = -1, + pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1; + + if ( !jc_info ) goto print_thrinfo; + + jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_work_id = bli_thread_work_id( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_thrinfo; + + pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_work_id = bli_thread_work_id( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_thrinfo; + + pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_work_id = bli_thread_work_id( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_thrinfo; + + ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_work_id = bli_thread_work_id( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + pa_info0 = bli_thrinfo_sub_prenode( ic_info ); + + // check_thrinfo_prenode: + + if ( !pa_info0 ) goto check_thrinfo_node; + + pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + 
pa_work_id0 = bli_thread_work_id( pa_info0 ); + jr_info0 = bli_thrinfo_sub_node( pa_info0 ); + + if ( !jr_info0 ) goto check_thrinfo_node; + + jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_work_id0 = bli_thread_work_id( jr_info0 ); + ir_info0 = bli_thrinfo_sub_node( jr_info0 ); + + if ( !ir_info0 ) goto check_thrinfo_node; + + ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_work_id0 = bli_thread_work_id( ir_info0 ); + + check_thrinfo_node: + + if ( !pa_info ) goto print_thrinfo; + + pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_work_id = bli_thread_work_id( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_thrinfo; + + jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_work_id = bli_thread_work_id( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_thrinfo; + + ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_work_id = bli_thread_work_id( ir_info ); + + print_thrinfo: +#else + dim_t jc_comm_id; + dim_t pc_comm_id; + dim_t pb_comm_id; + dim_t ic_comm_id; + dim_t pa_comm_id0, pa_comm_id; + dim_t jr_comm_id0, jr_comm_id; + dim_t ir_comm_id0, ir_comm_id; + + dim_t jc_work_id; + dim_t pc_work_id; + dim_t pb_work_id; + dim_t ic_work_id; + dim_t pa_work_id0, pa_work_id; + dim_t jr_work_id0, jr_work_id; + dim_t ir_work_id0, ir_work_id; + // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads // may not fully build their thrinfo_t structures--specifically when the // dimension being parallelized is not large enough for each thread to have @@ -488,6 +620,7 @@ void bli_l3_thrinfo_print_trsm_paths } } } +#endif printf( "comm ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_comm_id,