From f0824a04fc75e231c3a3d7757fa4e7294173282f Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 24 Mar 2014 15:21:42 -0500 Subject: [PATCH] Initial commit to enable threading in TRSM, Also enabled weighted partitioning for herk, trmm Fixed bug where multiple threads would try to modify the same state in the internal level 3 functions Correctly computed a_next and b_next for gemm, herk macrokernels a_next and b_next point to the current micropanels in trmm --- frame/3/gemm/bli_gemm_blk_var3f.c | 16 +-- frame/3/gemm/bli_gemm_int.c | 21 +++- frame/3/gemm/bli_gemm_ker_var2.c | 4 +- frame/3/gemm/bli_gemm_threading.c | 21 +--- frame/3/gemm/bli_gemm_threading.h | 4 + frame/3/herk/bli_herk_blk_var2f.c | 3 +- frame/3/herk/bli_herk_int.c | 18 ++- frame/3/herk/bli_herk_l_ker_var2.c | 4 +- frame/3/herk/bli_herk_threading.c | 10 +- frame/3/herk/bli_herk_threading.h | 5 + frame/3/herk/bli_herk_u_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_blk_var2b.c | 3 +- frame/3/trmm/bli_trmm_blk_var2f.c | 3 +- frame/3/trmm/bli_trmm_int.c | 18 ++- frame/3/trmm/bli_trmm_ll_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_threading.c | 10 +- frame/3/trsm/bli_trsm_blk_var1b.c | 76 +++++++----- frame/3/trsm/bli_trsm_blk_var1b.h | 3 +- frame/3/trsm/bli_trsm_blk_var1f.c | 72 +++++++----- frame/3/trsm/bli_trsm_blk_var1f.h | 3 +- frame/3/trsm/bli_trsm_blk_var2b.c | 113 +++++++++++------- frame/3/trsm/bli_trsm_blk_var2b.h | 3 +- frame/3/trsm/bli_trsm_blk_var2f.c | 114 +++++++++++------- frame/3/trsm/bli_trsm_blk_var2f.h | 3 +- frame/3/trsm/bli_trsm_blk_var3b.c | 112 +++++++++++------- frame/3/trsm/bli_trsm_blk_var3b.h | 3 +- frame/3/trsm/bli_trsm_blk_var3f.c | 112 +++++++++++------- frame/3/trsm/bli_trsm_blk_var3f.h | 3 +- frame/3/trsm/bli_trsm_front.c | 22 ++-- frame/3/trsm/bli_trsm_int.c | 30 +++-- frame/3/trsm/bli_trsm_int.h | 3 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 12 +- 
frame/3/trsm/bli_trsm_ll_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_lu_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_rl_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_ru_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_threading.c | 173 ++++++++++++++++++++++++++++ frame/3/trsm/bli_trsm_threading.h | 79 +++++++++++++ frame/base/bli_threading.c | 56 ++++++++- frame/base/bli_threading.h | 3 + 46 files changed, 889 insertions(+), 342 deletions(-) create mode 100644 frame/3/trsm/bli_trsm_threading.c create mode 100644 frame/3/trsm/bli_trsm_threading.h diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index f0647ccb3..a6a70181b 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -115,14 +115,6 @@ void bli_gemm_blk_var3f( obj_t* a, bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), gemm_thread_sub_ipackm( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); // Packing must be done before computation. thread_ibarrier( thread ); @@ -136,6 +128,14 @@ void bli_gemm_blk_var3f( obj_t* a, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread) ); + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c_pack is a local obj_t, we can simply overwrite the + // internal beta scalar with BLIS_ONE once it has been used in the + // first iteration. 
+ if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + } thread_obarrier( thread ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 5218ab8c0..9c0adee84 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -79,7 +79,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -88,7 +90,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -106,23 +110,28 @@ void bli_gemm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } + thread_obarrier( thread ); // Extract the variant number and implementation type. 
n = cntl_var_num( cntl ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 7d0734e40..2d5cc7bca 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -249,11 +249,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 6d2ec5f1b..047b083cf 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -84,28 +84,17 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -dim_t read_env( char* env ) -{ - dim_t number = 1; - char* str = getenv( env ); - if( str != NULL ) - { - number = strtol( str, NULL, 10 ); - } - return number; -} - void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads ) { } gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index 54a8f4884..24bf6d734 
100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -53,6 +53,10 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_opackm( thread ) thread->opackm #define gemm_thread_sub_ipackm( thread ) thread->ipackm +// For use in gemm micro-kernel +#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) + gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ); void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** ); diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 3ef777247..5fcb56001 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,7 +90,8 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index 64fd7b1c4..0bc5c6a9b 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -89,7 +89,9 @@ void bli_herk_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -107,28 +109,34 @@ void bli_herk_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. 
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; + thread_obarrier( thread ); + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index c4d46718b..464e54588 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 942014883..2b291a924 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -90,11 +90,11 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads ) herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h index 05e038aab..d156547a8 100644 --- a/frame/3/herk/bli_herk_threading.h +++ b/frame/3/herk/bli_herk_threading.h @@ -53,6 +53,11 @@ typedef struct herk_thrinfo_s herk_thrinfo_t; #define herk_thread_sub_opackm( thread ) thread->opackm #define herk_thread_sub_ipackm( thread ) thread->ipackm +// For use in herk micro-kernel +#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) + + herk_thrinfo_t** bli_create_herk_thrinfo_paths( ); void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths ); diff --git a/frame/3/herk/bli_herk_u_ker_var2.c 
b/frame/3/herk/bli_herk_u_ker_var2.c index 573738c0f..694f8a211 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 2a211bdbc..86787a80a 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -82,7 +82,8 @@ void bli_trmm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index f1ccedd45..39033fcf3 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -82,7 +82,8 @@ void bli_trmm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 56327008b..0148a670b 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -111,7 +111,9 @@ void bli_trmm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -129,22 +131,26 @@ void bli_trmm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -164,6 +170,8 @@ void bli_trmm_int( obj_t* alpha, else uplo = 1; } + thread_obarrier( thread ); + // Extract the variant number and implementation type. 
n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 772d91816..b5950a603 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -320,11 +320,11 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1011 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * ss_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -381,11 +381,11 @@ void PASTEMAC(ch,varname)( \ if( trmm_l_ir_my_iter( i, ir_thread ) ) \ { \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 0622bbbb2..e4568c70c 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -325,11 +325,11 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1112 * ss_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -385,11 +385,11 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index f48baf4b3..296325ec8 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b1121 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b1121 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 97626a717..7f13e47a8 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b0111 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b0111 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 0a9d83da2..3a6a7c0b4 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -90,11 +90,11 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads ) trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 6d4681f35..66b3e9fc7 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. 
+ if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing A1. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_obroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -60,22 +79,16 @@ void bli_trsm_blk_var1b( obj_t* a, // A begins. if ( bli_obj_is_upper( *a ) ) offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) - - bli_obj_width_after_trans( *a ); + bli_obj_width_after_trans( *a ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, m_trans, a, + b_alg = bli_determine_blocksize_b( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -84,29 +97,34 @@ void bli_trsm_blk_var1b( obj_t* a, bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); - //if ( bli_obj_is_zeros( a1 ) ) continue; - // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a1_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( b_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1b.h b/frame/3/trsm/bli_trsm_blk_var1b.h index 4ced0fc92..99585b947 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.h +++ b/frame/3/trsm/bli_trsm_blk_var1b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index 8177e183b..0525db3be 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing A1. 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_obroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -61,20 +80,14 @@ void bli_trsm_blk_var1f( obj_t* a, if ( bli_obj_is_lower( *a ) ) offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -84,26 +97,33 @@ void bli_trsm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a1_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( b_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1f.h b/frame/3/trsm/bli_trsm_blk_var1f.h index c815c03ff..48384c369 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.h +++ b/frame/3/trsm/bli_trsm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 724b88f2d..435c9dec3 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -37,40 +37,56 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, n_trans, b, @@ -83,38 +99,55 @@ void bli_trsm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2b.h b/frame/3/trsm/bli_trsm_blk_var2b.h index fb352ce39..de4a8f899 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.h +++ b/frame/3/trsm/bli_trsm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 5e57ecee8..43b46b752 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -37,40 +37,57 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, n_trans, b, @@ -83,38 +100,55 @@ void bli_trsm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2f.h b/frame/3/trsm/bli_trsm_blk_var2f.h index 44eb38460..ade7f0bf4 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.h +++ b/frame/3/trsm/bli_trsm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 252f2eef7..3e586cdfc 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -37,38 +37,51 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,45 +96,60 @@ void bli_trsm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3b.h b/frame/3/trsm/bli_trsm_blk_var3b.h index d8f6c8dc6..a1779dc67 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.h +++ b/frame/3/trsm/bli_trsm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index c59596090..2a3384a2b 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -37,38 +37,51 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t 
a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,45 +96,60 @@ void bli_trsm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3f.h b/frame/3/trsm/bli_trsm_blk_var3f.h index 8546b0ba5..013d70bc1 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.h +++ b/frame/3/trsm/bli_trsm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 1dd67ece5..e7cae7d51 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -125,12 +125,20 @@ void bli_trsm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trsm_int( alpha, - &a_local, - &b_local, - alpha, - &c_local, - cntl ); + trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trsm_thrinfo_free_paths( infos ); } diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index db0fdf393..6644b3512 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -109,7 +111,9 @@ void bli_trsm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -127,14 +131,17 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -150,7 +157,8 @@ void bli_trsm_int( obj_t* alpha, // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) @@ -164,10 +172,13 @@ void bli_trsm_int( obj_t* alpha, // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &a_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &a_local ); } } + thread_obarrier( thread ); + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); @@ -179,6 +190,7 @@ void bli_trsm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index 504f7928c..62a937b3c 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -37,4 +37,5 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index bb0ed34db..0d31f656b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t 
cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. */ \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.h b/frame/3/trsm/bli_trsm_ll_ker_var2.h index 59e8e576b..d13ab6f23 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index d86a87ca0..6d0efe5e8 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.h b/frame/3/trsm/bli_trsm_lu_ker_var2.h index 50b18cf79..c26d0081a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.h +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 5d0288c40..3bc951bd5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.h b/frame/3/trsm/bli_trsm_rl_ker_var2.h index a0605a7b7..8cc3c5fed 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.h +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 9bac5c946..6711ba423 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.h b/frame/3/trsm/bli_trsm_ru_ker_var2.h index ebb24b81f..c07b215af 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c new file mode 100644 index 000000000..08c915b15 --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_trsm = sub_trsm; +} + +void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_trsm = thread; +} + +trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ + trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc( sizeof( trsm_thrinfo_t ) ); + bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_trsm ); + return thread; +} + +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads ) +{ 
+} + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) +{ + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + trsm_thrinfo_t** paths = (trsm_thrinfo_t**) malloc( global_num_threads * sizeof( trsm_thrinfo_t* ) ); + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = 
bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; + } + } + } + } + } + return paths; +} diff --git a/frame/3/trsm/bli_trsm_threading.h b/frame/3/trsm/bli_trsm_threading.h new file mode 100644 index 000000000..30bc612bf --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct trsm_thrinfo_s //implements thrinfo_t; one node per loop level, chained through sub_trsm +{ + thread_comm_t* ocomm; //Outer communicator: every thread that enters this loop level together (used by thread_obarrier/thread_obroadcast) + dim_t ocomm_id; //Our thread id within ocomm + thread_comm_t* icomm; //Inner communicator: the threads of our own caucus, i.e. those sharing one iteration of this loop (used by thread_ibarrier/thread_ibroadcast) + dim_t icomm_id; //Our thread id within icomm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //Index of our caucus, in [0, n_way) + + packm_thrinfo_t* opackm; //Thread info handed to packm for the operand packed once, before the partitioning loop + packm_thrinfo_t* ipackm; //Thread info handed to packm for the operands packed inside every loop iteration + struct trsm_thrinfo_s* sub_trsm; //Node for the next loop level down, passed to the trsm subproblem +}; +typedef struct trsm_thrinfo_s trsm_thrinfo_t; + +#define trsm_thread_sub_trsm( thread ) thread->sub_trsm +#define trsm_thread_sub_opackm( thread ) thread->opackm +#define trsm_thread_sub_ipackm( thread ) thread->ipackm + +#define trsm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ); +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** ); + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, +
thread_comm_t* icomm, dim_t icomm_id,
+                                  dim_t n_way, dim_t work_id,
+                                  packm_thrinfo_t* opackm,
+                                  packm_thrinfo_t* ipackm,
+                                  trsm_thrinfo_t* sub_trsm );
+
+trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
+                                              thread_comm_t* icomm, dim_t icomm_id,
+                                              dim_t n_way, dim_t work_id,
+                                              packm_thrinfo_t* opackm,
+                                              packm_thrinfo_t* ipackm,
+                                              trsm_thrinfo_t* sub_trsm );
+
+void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread );
diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c
index c0ef641ea..0b9ec30bd 100644
--- a/frame/base/bli_threading.c
+++ b/frame/base/bli_threading.c
@@ -230,8 +230,65 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
     *end = bli_min( *start + n_pt, size + all_start );
 }
 
-void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
+// Returns in [*start, *end) the part of [all_start, all_end) assigned to
+// caucus thr->work_id when the range is split thr->n_way ways. Pieces are
+// handed out one after another; with len the extent already handed out, the
+// next piece is sqrt( len*len + size*size/n_way ) - len wide, rounded up to a
+// multiple of block_factor.
+//  - forward != 0: pieces are cut from all_end downwards, caucus n_way-1 first.
+//  - forward == 0: pieces are cut from all_start upwards, caucus 0 first.
+// Pieces are clipped to the range and the last one handed out takes whatever
+// is left, so the n_way pieces tile the range exactly (some may be empty).
+// thr must point to a thrinfo_t; work_id is expected to lie in [0, n_way).
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
 {
+    thrinfo_t* thread = (thrinfo_t*) thr;
+    dim_t n_way = thread->n_way;
+    dim_t work_id = thread->work_id;
+    dim_t size, num, my_turn, turn;
+    dim_t len = 0;   // Extent handed out before the current turn
+    dim_t width = 0; // Extent of the piece handed out on the current turn
+
+    *start = all_start;
+    *end = all_end;
+
+    // One caucus owns everything, and an empty range has nothing to split.
+    // Bailing out here also keeps the divisions below well defined.
+    if( n_way < 2 || all_end <= all_start ) return;
+    if( block_factor < 1 ) block_factor = 1;
+
+    size = all_end - all_start;
+    num = size*size / n_way; // 2xArea per thread?
+
+    // Position of this caucus in the order the pieces are handed out.
+    my_turn = forward ? n_way - 1 - work_id : work_id;
+
+    for( turn = 0; ; turn++ ) {
+        dim_t left = size - len;
+        if( turn == n_way - 1 ) {
+            // sqrt() is truncated, so the widths can fall short of size;
+            // the last piece absorbs the remainder.
+            width = left;
+        }
+        else{
+            width = sqrt( len*len + num ) - len;
+            width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
+            // Rounding up can overshoot the end of the range.
+            if( width > left ) width = left;
+        }
+        if( turn >= my_turn ) break;
+        len += width;
+    }
+
+    // len and width are relative to the range, so offset by its bounds.
+    if( forward ) {
+        *end = all_end - len;
+        *start = *end - width;
+    }
+    else{
+        *start = all_start + len;
+        *end = *start + width;
+    }
 }
 
 void bli_level3_thread_decorator( dim_t n_threads,
@@ -257,3 +314,21 @@ void bli_level3_thread_decorator( dim_t n_threads,
                      thread[omp_id] );
     }
 }
+
+// Reads the environment variable named env as a base-10 integer. Returns 1
+// when the variable is unset or does not parse to a value of at least 1, so
+// callers never receive a zero or negative number of ways.
+dim_t bli_read_nway_from_env( char* env )
+{
+    dim_t number = 1;
+    char* str = getenv( env );
+    if( str != NULL )
+    {
+        long parsed = strtol( str, NULL, 10 );
+        if( parsed > 0 )
+        {
+            number = parsed;
+        }
+    }
+    return number;
+}
diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h
index daaf2d6f4..f09da42c3 100644
--- a/frame/base/bli_threading.h
+++ b/frame/base/bli_threading.h
@@ -88,11 +88,13 @@ typedef struct thrinfo_s thrinfo_t;
 #define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
 
 void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
+void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
 thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
                                    dim_t n_way, dim_t work_id );
 
 void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
                             dim_t n_way, dim_t work_id );
+dim_t bli_read_nway_from_env( char* env );
 
 //void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm );
 //thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels );
@@ -100,6 +102,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm
 #include "bli_gemm_threading.h"
 #include "bli_herk_threading.h"
 #include "bli_trmm_threading.h"
+#include "bli_trsm_threading.h"
 
 typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread );
 void bli_level3_thread_decorator( dim_t num_threads,