From 2f8a357de5fb55163a969d888cf059f24b78125c Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 23 Jul 2014 13:40:12 -0500 Subject: [PATCH] Some TRSM threading fixes/additions --- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trsm/bli_trsm_blk_var2b.c | 9 +++++---- frame/3/trsm/bli_trsm_blk_var2f.c | 9 +++++---- frame/3/trsm/bli_trsm_front.c | 4 ++-- frame/3/trsm/bli_trsm_int.c | 15 +++++---------- frame/3/trsm/bli_trsm_threading.c | 29 ++++++++++++++++------------- frame/3/trsm/bli_trsm_threading.h | 2 +- 7 files changed, 35 insertions(+), 35 deletions(-) diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index dea8b7771..90d621713 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -125,7 +125,7 @@ void bli_trmm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( !bli_is_left( side ) ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( bli_is_right( side ) ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 2ee269cee..7c48287a7 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -83,15 +83,16 @@ void bli_trsm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, - bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), - bli_obj_is_upper( *c ), &start, &end ); + bli_get_range( thread, 0, n_trans, + //bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + 8, + &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, n_trans, b, + b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 41ccc668f..e25fcb873 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -83,15 +83,16 @@ void bli_trsm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, - bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), - bli_obj_is_lower( *c ), &start, &end ); + bli_get_range( thread, 0, n_trans, + //bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + 8, + &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 2c42c24f9..cbafb6014 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -125,9 +125,9 @@ void bli_trsm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths(); + trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) ); dim_t n_threads = thread_num_threads( infos[0] ); - + // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_trsm_int, diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index 4377b97ce..ebc48f307 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -131,17 +131,14 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - if( thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - } + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -157,8 +154,7 @@ void bli_trsm_int( obj_t* alpha, // attached to B (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) @@ -172,8 +168,7 @@ void bli_trsm_int( obj_t* alpha, // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( alpha, &a_local ); + bli_obj_scalar_apply_scalar( alpha, &a_local ); } } diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c index 73832f4f8..152d40c1a 100644 --- a/frame/3/trsm/bli_trsm_threading.c +++ b/frame/3/trsm/bli_trsm_threading.c @@ -107,23 +107,26 @@ void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num ) bli_free( threads ); } -trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided ) { + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; + #ifdef BLIS_ENABLE_MULTITHREADING dim_t jc_in = bli_read_nway_from_env( "BLIS_JC_NT" ); /*dim_t kc_in = bli_read_nway_from_env( "BLIS_KC_NT" );*/ dim_t ic_in = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_in = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_in = bli_read_nway_from_env( "BLIS_IR_NT" ); - - dim_t jr_way = jc_in * ic_in * jr_in * ir_in; -#else - dim_t jr_way = 1; + + if(!right_sided){ + jc_way = jc_in; + jr_way = jr_in * ic_in * ir_in; + } #endif - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t ir_way = 1; dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); @@ -171,12 +174,12 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) NULL, NULL, ir_info); packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id, ic_comm, ic_comm_id, diff --git a/frame/3/trsm/bli_trsm_threading.h b/frame/3/trsm/bli_trsm_threading.h index 8dab87d90..3065062bc 100644 --- a/frame/3/trsm/bli_trsm_threading.h +++ b/frame/3/trsm/bli_trsm_threading.h @@ -55,7 +55,7 @@ typedef struct trsm_thrinfo_s trsm_thrinfo_t; #define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ); +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided ); void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** info, dim_t n_threads ); void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,