From 5d5dc2eedef2f7c90d61371a1b457be5c06cf583 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 20 Mar 2014 16:43:36 -0500 Subject: [PATCH] Parallelized trmm and trmm3 Also fixed bugs in packm --- frame/1m/packm/bli_packm_blk_var1.c | 23 ++- frame/1m/packm/bli_packm_blk_var3.c | 19 ++- frame/1m/packm/bli_packm_blk_var4.c | 19 ++- frame/1m/packm/bli_packm_threading.h | 2 + frame/3/gemm/bli_gemm_blk_var1f.c | 2 +- frame/3/gemm/bli_gemm_blk_var2f.c | 2 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/her2k/bli_her2k_front.c | 4 +- frame/3/herk/bli_herk_blk_var1f.c | 2 +- frame/3/herk/bli_herk_blk_var2f.c | 2 +- frame/3/herk/bli_herk_front.c | 2 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/syr2k/bli_syr2k_front.c | 4 +- frame/3/syrk/bli_syrk_front.c | 2 +- frame/3/trmm/bli_trmm_blk_var1f.c | 111 +++++++++----- frame/3/trmm/bli_trmm_blk_var1f.h | 3 +- frame/3/trmm/bli_trmm_blk_var2b.c | 116 +++++++++----- frame/3/trmm/bli_trmm_blk_var2b.h | 3 +- frame/3/trmm/bli_trmm_blk_var2f.c | 116 +++++++++----- frame/3/trmm/bli_trmm_blk_var2f.h | 3 +- frame/3/trmm/bli_trmm_blk_var3b.c | 109 ++++++++----- frame/3/trmm/bli_trmm_blk_var3b.h | 3 +- frame/3/trmm/bli_trmm_blk_var3f.c | 109 ++++++++----- frame/3/trmm/bli_trmm_blk_var3f.h | 3 +- frame/3/trmm/bli_trmm_front.c | 22 ++- frame/3/trmm/bli_trmm_int.c | 9 +- frame/3/trmm/bli_trmm_int.h | 3 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 222 ++++++++++++++------------- frame/3/trmm/bli_trmm_ll_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 24 ++- frame/3/trmm/bli_trmm_lu_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 23 ++- frame/3/trmm/bli_trmm_rl_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 23 ++- frame/3/trmm/bli_trmm_ru_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_threading.c | 173 +++++++++++++++++++++ frame/3/trmm/bli_trmm_threading.h | 79 ++++++++++ frame/3/trmm3/bli_trmm3_front.c | 22 ++- frame/base/bli_threading.c | 9 +- frame/base/bli_threading.h | 4 +- 40 files changed, 897 insertions(+), 403 
deletions(-) create mode 100644 frame/3/trmm/bli_trmm_threading.c create mode 100644 frame/3/trmm/bli_trmm_threading.h diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index d8c84425b..7a5caf7de 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -263,18 +263,14 @@ void PASTEMAC(ch,varname )( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -323,6 +319,8 @@ void PASTEMAC(ch,varname )( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk)( strucc, \ diagoffp_i, \ diagc, \ @@ -336,6 +334,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + }\ \ \ p_inc = ldp * panel_len_max_i; \ @@ -349,6 +348,8 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk)( strucc, \ diagoffc_i, \ uploc, \ @@ -360,6 +361,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ldp * panel_len_max_i; \ @@ -373,6 +375,8 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -384,10 +388,13 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ -\ + } \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ - } \ + } \ +\ +\ + p_begin += p_inc; \ } \ \ \ diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index bf93341c6..2d69e51d7 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ diagoffp_i, \ diagc, \ @@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -399,6 +398,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( 
packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ diagoffc_i, \ uploc, \ @@ -411,6 +412,7 @@ void PASTEMAC(ch,varname)( \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ \ + } \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ } \ @@ -423,6 +425,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -434,6 +438,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -448,6 +453,8 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ +\ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index 3d1ab78dc..8cfd49afa 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri)( 
strucc, \ diagoffp_i, \ diagc, \ @@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ p_inc = ldp * panel_len_max_i; \ \ @@ -406,6 +405,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ diagoffc_i, \ uploc, \ @@ -417,6 +418,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -430,6 +432,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -441,6 +445,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ldp * panel_len_max_i; \ @@ -463,6 +468,8 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ +\ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_threading.h b/frame/1m/packm/bli_packm_threading.h index 12be0c9cd..0d6fce2e4 100644 --- a/frame/1m/packm/bli_packm_threading.h +++ b/frame/1m/packm/bli_packm_threading.h @@ -44,6 +44,8 @@ struct packm_thrinfo_s //implements thrinfo_t }; typedef struct packm_thrinfo_s packm_thrinfo_t; +#define packm_thread_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) /* nonzero when index and the thread's work_id are congruent modulo n_way; arguments are parenthesized so expression arguments expand safely */ + packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 368c303cf..c3e5db6c0 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, m_trans, 8, &start, &end ); + bli_get_range( thread, 0, m_trans, 8, &start, &end ); // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 9d1a7ea5c..c3a708211 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -85,7 +85,7 @@ void bli_hemm_front( side_t side, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_gemm_int, + (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 6d019fe57..b8329cf5b 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -116,7 +116,7 @@ void bli_her2k_front( obj_t* alpha, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &bh_local, @@ -126,7 +126,7 @@ void bli_her2k_front( obj_t* alpha, (void**) infos ); bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, &alpha_conj, &b_local, &ah_local, diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 88671b99f..899aa194c 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range( thread, m_trans, 8, &start, &end ); + bli_get_range( thread, 0, m_trans, 8, &start, &end ); // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index a1fba63f4..3ef777247 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,7 +90,7 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - bli_get_range( thread, n_trans, 8, &start, &end ); + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 33c36fd3b..6139478ea 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -82,7 +82,7 @@ void bli_herk_front( obj_t* alpha, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &ah_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index cce25b4c8..ed0c44664 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -84,7 +84,7 @@ void bli_symm_front( side_t side, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_gemm_int, + (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index fb5d4f0f6..f1ce3e279 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -98,7 +98,7 @@ void bli_syr2k_front( obj_t* alpha, // Invoke the internal back-end. 
bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &bt_local, @@ -108,7 +108,7 @@ void bli_syr2k_front( obj_t* alpha, (void**) infos ); bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &b_local, &at_local, diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index d9039cdb0..c5ac22797 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -78,7 +78,7 @@ void bli_syrk_front( obj_t* alpha, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &at_local, diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 23238a089..ac1973366 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -37,21 +37,48 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1, c1_pack; + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -66,25 +93,14 @@ void bli_trmm_blk_var1f( obj_t* a, m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) + bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -94,38 +110,55 @@ void bli_trmm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.h b/frame/3/trmm/bli_trmm_blk_var1f.h index c9fc004f7..63994a9a6 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.h +++ b/frame/3/trmm/bli_trmm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 0c98da8e6..2a211bdbc 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -37,43 +37,58 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, n_trans, b, + b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -83,38 +98,55 @@ void bli_trmm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). 
- bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2b.h b/frame/3/trmm/bli_trmm_blk_var2b.h index e8d54ecdb..afb9f9903 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.h +++ b/frame/3/trmm/bli_trmm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 14571322b..f1ccedd45 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -37,43 +37,58 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -83,38 +98,55 @@ void bli_trmm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). 
- bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2f.h b/frame/3/trmm/bli_trmm_blk_var2f.h index 148bbd234..8c47d55b8 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.h +++ b/frame/3/trmm/bli_trmm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 11b3dc551..40e9e21d6 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -37,38 +37,50 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,38 +95,51 @@ void bli_trmm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). 
- bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3b.h b/frame/3/trmm/bli_trmm_blk_var3b.h index bcd4c8c4b..e3a5bfbb3 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.h +++ b/frame/3/trmm/bli_trmm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 59050423c..80293e42f 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -37,38 +37,50 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,38 +95,51 @@ void bli_trmm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trmm subproblem. 
bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3f.h b/frame/3/trmm/bli_trmm_blk_var3f.h index 4be2c7b3c..6f9338cbb 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.h +++ b/frame/3/trmm/bli_trmm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1911ba3be..644f27d4b 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -125,12 +125,20 @@ void bli_trmm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trmm_int( alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos ); } diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 287205873..56327008b 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -173,6 +175,7 @@ void bli_trmm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/bli_trmm_int.h index 18c2d0da0..70d8b551e 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/bli_trmm_int.h @@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 01fc281ee..99e0dcec7 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + 
gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -270,9 +274,12 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ - { \ + for ( j = 0; j < n_iter; ++j ) { \ +\ + if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ +\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ @@ -307,121 +314,124 @@ void PASTEMAC(ch,varname)( \ off_a1011 = 0; \ k_a1011 = diagoffa_i + MR; \ \ - b1_i = b1 + off_a1011 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * ss_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ -\ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ -\ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + b1_i = b1 + off_a1011 * PACKNR; \ \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + k_a1011 * ss_a; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ + \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ + \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ + \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ + \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux ); \ + \ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += k_a1011 * ss_a; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ \ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ \ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( rstep_a, aux ); \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( rstep_a, aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux ); \ \ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += rstep_a; \ } \ -\ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.h b/frame/3/trmm/bli_trmm_ll_ker_var2.h index eb9cb1cc5..9710adc7c 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 867809da0..0622bbbb2 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ @@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ @@ -315,6 +321,7 @@ void PASTEMAC(ch,varname)( \ off_a1112 = diagoffa_i; \ k_a1112 = k - off_a1112; \ \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ @@ -369,11 +376,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += k_a1112 * ss_a; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ @@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.h b/frame/3/trmm/bli_trmm_lu_ker_var2.h index 3ba1f0ca7..508612a90 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.h +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae4b4b1d2..f48baf4b3 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b1121 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.h b/frame/3/trmm/bli_trmm_rl_ker_var2.h index 3059aaaa9..d1e998bf6 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.h +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 57d112ce5..d9a28f86d 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b0111 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.h b/frame/3/trmm/bli_trmm_ru_ker_var2.h index 93c22402f..cb4a7b937 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c new file mode 100644 index 000000000..0a9d83da2 --- /dev/null +++ b/frame/3/trmm/bli_trmm_threading.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_trmm = sub_trmm; +} + +void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_trmm = thread; +} + +trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ) +{ + trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) 
bli_malloc( sizeof( trmm_thrinfo_t ) ); + bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_trmm ); + return thread; +} + +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads ) +{ +} + +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) +{ + dim_t jc_way = read_env( "BLIS_JC_NT" ); + dim_t kc_way = read_env( "BLIS_KC_NT" ); + dim_t ic_way = read_env( "BLIS_IC_NT" ); + dim_t jr_way = read_env( "BLIS_JR_NT" ); + dim_t ir_way = read_env( "BLIS_IR_NT" ); + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + trmm_thrinfo_t** paths = (trmm_thrinfo_t**) malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) ); + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + trmm_thrinfo_t* jr_info = bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, 
d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; + } + } + } + } + } + return paths; +} diff --git a/frame/3/trmm/bli_trmm_threading.h b/frame/3/trmm/bli_trmm_threading.h new file mode 100644 index 000000000..376608261 --- /dev/null +++ b/frame/3/trmm/bli_trmm_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct trmm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct trmm_thrinfo_s* sub_trmm; +}; +typedef struct trmm_thrinfo_s trmm_thrinfo_t; + +#define trmm_thread_sub_trmm( thread ) thread->sub_trmm +#define trmm_thread_sub_opackm( thread ) thread->opackm +#define trmm_thread_sub_ipackm( thread ) thread->ipackm + +#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == 
thread->work_id % thread->n_way ) + +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ); +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** ); + +void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 1d4a68918..080b9a399 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,12 +127,20 @@ void bli_trmm3_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trmm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos ); } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index d612210b2..c0ef641ea 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -216,17 +216,18 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) return info_paths; } */ -void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ) +void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) { thrinfo_t* thread = (thrinfo_t*) thr; - dim_t n_way = thread->n_way; dim_t work_id = thread->work_id; + + dim_t size = all_end - all_start; dim_t n_pt = size / n_way; n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); - *start = work_id * n_pt; - *end = bli_min( *start + n_pt, size ); + *start = work_id * n_pt + all_start; + *end = bli_min( *start + n_pt, size + all_start ); } void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index bb2bd6ba3..daaf2d6f4 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -87,7 +87,8 @@ typedef struct thrinfo_s thrinfo_t; #define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) #define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) -void bli_get_range( void* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ); +void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); + thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t 
n_way, dim_t work_id ); void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, @@ -98,6 +99,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_packm_threading.h" #include "bli_gemm_threading.h" #include "bli_herk_threading.h" +#include "bli_trmm_threading.h" typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); void bli_level3_thread_decorator( dim_t num_threads,