diff --git a/config/bgq/bli_config.h b/config/bgq/bli_config.h index 39a627a6a..234f39648 100644 --- a/config/bgq/bli_config.h +++ b/config/bgq/bli_config.h @@ -111,16 +111,16 @@ // Alignment size used when allocating memory dynamically from the operating // system (eg: posix_memalign()). To disable heap alignment and just use // malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 32 +#define BLIS_HEAP_ADDR_ALIGN_SIZE 64 // Alignment size used when sizing leading dimensions of dynamically // allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE 32 +#define BLIS_HEAP_STRIDE_ALIGN_SIZE 64 // Alignment size used when allocating entire blocks of contiguous memory // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32 // -- MIXED DATATYPE SUPPORT --------------------------------------------------- @@ -154,12 +154,13 @@ // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 // Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ +// Underscore is left out to work on BGQ systems +#define PASTEF770(name) name //## _ +#define PASTEF77(ch1,name) ch1 ## name //## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name //## _ diff --git a/config/bgq/bli_kernel.h b/config/bgq/bli_kernel.h index 02d9c89b5..80065ec06 100644 --- a/config/bgq/bli_kernel.h +++ b/config/bgq/bli_kernel.h @@ -54,27 +54,22 @@ // (b) NR (for triangular operations such as trmm and trsm). 
// -#define BLIS_DEFAULT_MC_S 256 -#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_MC_S 1024 +#define BLIS_DEFAULT_KC_S 2048 #define BLIS_DEFAULT_NC_S 8192 -// 16 MPI RANKS CASE: -//#define BLIS_DEFAULT_MC_D 256//1024 -//#define BLIS_DEFAULT_KC_D 512//2048 -// - // 1 MPI RANK CASE: -#define BLIS_DEFAULT_MC_D 1008 -#define BLIS_DEFAULT_KC_D 2016 -#define BLIS_DEFAULT_NC_D 20480 +#define BLIS_DEFAULT_MC_D 1024 +#define BLIS_DEFAULT_KC_D 2048 +#define BLIS_DEFAULT_NC_D 10240 -#define BLIS_DEFAULT_MC_C 128 -#define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 4096 +#define BLIS_DEFAULT_MC_C 1024 +#define BLIS_DEFAULT_KC_C 2048 +#define BLIS_DEFAULT_NC_C 8192 -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 2048 +#define BLIS_DEFAULT_MC_Z 768 +#define BLIS_DEFAULT_KC_Z 1536 +#define BLIS_DEFAULT_NC_Z 10240 // -- Register blocksizes -- @@ -87,7 +82,7 @@ #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 -#define BLIS_DEFAULT_MR_Z 8 +#define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 // NOTE: If the micro-kernel, which is typically unrolled to a factor @@ -153,7 +148,7 @@ // -- Default fusing factors for level-1f operations -- #define BLIS_L1F_FUSE_FAC_S 8 -#define BLIS_L1F_FUSE_FAC_D 4 +#define BLIS_L1F_FUSE_FAC_D 8 #define BLIS_L1F_FUSE_FAC_C 4 #define BLIS_L1F_FUSE_FAC_Z 2 @@ -182,7 +177,7 @@ #include "bli_gemm_8x8.h" #define BLIS_DGEMM_UKERNEL bli_dgemm_8x8 -#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt +#define BLIS_ZGEMM_UKERNEL bli_zgemm_8x8 // -- trsm-related -- diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h index 637e71f74..688ed75ec 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -36,6 +36,9 @@ #define BLIS_CONFIG_H +#define BLIS_TREE_BARRIER +#define BLIS_TREE_BARRIER_ARITY 4 + // -- OPERATING SYSTEM --------------------------------------------------------- diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index c894bc638..b09bea493 100644 --- 
a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate # --- Determine the C compiler and related flags --- CC := icc CPPROCFLAGS := -CMISCFLAGS := -mmic -fasm-blocks -std=c99 +CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp CDBGFLAGS := CWARNFLAGS := -Wall COPTFLAGS := -O3 @@ -98,7 +98,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -mmic -lm +LDFLAGS := -mmic -lm -openmp diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 60a7d6894..c666a9b44 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); void bli_packm_blk_var1( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -117,31 +119,33 @@ void bli_packm_blk_var1( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - bool_t invdiag, \ - bool_t revifup, \ - bool_t reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ - ) \ +void PASTEMAC(ch,varname )( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + bool_t invdiag, \ + bool_t revifup, \ + bool_t reviflo, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t 
ps_p, \ + packm_thrinfo_t* thread \ + ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict c_cast = c; \ @@ -260,7 +264,7 @@ void PASTEMAC(ch,varname)( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ @@ -315,6 +319,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk)( strucc, \ diagoffp_i, \ diagc, \ @@ -328,6 +334,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + }\ \ \ p_inc = ldp * panel_len_max_i; \ @@ -341,6 +348,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk)( strucc, \ diagoffc_i, \ uploc, \ @@ -352,6 +361,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -365,6 +375,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -376,13 +388,13 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ -\ + } \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ldp * panel_len_max_i; \ - } \ + } \ \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ \ \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 5a2c356a5..e4cd44e78 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var1( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROT @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( packm_blk_var1 ) diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index fdd25a1a1..2d69e51d7 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); void bli_packm_blk_var3( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -98,7 +100,7 @@ void bli_packm_blk_var3( obj_t* c, // in the real domain. if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p ); + bli_packm_blk_var1( c, p, t ); return; } @@ -109,23 +111,26 @@ void bli_packm_blk_var3( obj_t* c, // real domain counterparts. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + if ( thread_am_ochief( t ) ) { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. 
+ bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. @@ -154,7 +159,8 @@ void bli_packm_blk_var3( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ @@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ diagoffp_i, \ diagc, \ @@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -388,6 +398,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ 
\ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ diagoffc_i, \ uploc, \ @@ -400,6 +412,7 @@ void PASTEMAC(ch,varname)( \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ \ + } \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ } \ @@ -412,6 +425,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -423,6 +438,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -438,7 +454,7 @@ void PASTEMAC(ch,varname)( \ \ } \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h index 6189d2415..b1d684262 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h +++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var3( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var3 ) diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index 6caa3448e..8cfd49afa 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4); void bli_packm_blk_var4( obj_t* c, - obj_t* p ) + obj_t* p, + 
packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -98,7 +100,7 @@ void bli_packm_blk_var4( obj_t* c, // in the real domain. if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p ); + bli_packm_blk_var1( c, p, t ); return; } @@ -109,23 +111,26 @@ void bli_packm_blk_var4( obj_t* c, // real domain counterparts. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + if( thread_am_ochief( t ) ) { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. 
@@ -154,7 +159,8 @@ void bli_packm_blk_var4( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ @@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \ diagoffp_i, \ diagc, \ @@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ p_inc = ldp * panel_len_max_i; \ \ @@ -395,6 +405,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ diagoffc_i, \ uploc, \ @@ -406,6 +418,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -419,6 +432,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -430,6 +445,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ldp * panel_len_max_i; \ @@ -453,7 +469,7 @@ void PASTEMAC(ch,varname)( \ \ } \ \ - p_begin += p_inc; \ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h index e13e5fe33..e727873e4 100644 --- a/frame/1m/packm/bli_packm_blk_var4.h +++ b/frame/1m/packm/bli_packm_blk_var4.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var4( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* t \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var4 ) diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index a7bf3ce61..a3d89b679 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -37,7 +37,8 @@ #define FUNCPTR_T packm_fp typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); static FUNCPTR_T vars[6][3] = { @@ -52,7 +53,8 @@ static FUNCPTR_T vars[6][3] = void bli_packm_int( obj_t* a, obj_t* p, - packm_t* cntl ) + packm_t* cntl, + packm_thrinfo_t* thread ) { varnum_t n; impl_t i; @@ -119,6 +121,10 @@ void bli_packm_int( obj_t* a, // Invoke the variant with kappa_use. 
f( a, - p ); + p, + thread ); + + // Barrier so that packing is done before computation + thread_obarrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 923dcbc3c..1e6a122ac 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -34,5 +34,6 @@ void bli_packm_int( obj_t* a, obj_t* p, - packm_t* cntl ); + packm_t* cntl, + packm_thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_threading.c b/frame/1m/packm/bli_packm_threading.c new file mode 100644 index 000000000..098475c5e --- /dev/null +++ b/frame/1m/packm/bli_packm_threading.c @@ -0,0 +1,64 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_packm_thrinfo_free( packm_thrinfo_t* thread ) +{ + //Assume that the ocomm and the icomm are freed by something else and don't need to be freed. + bli_free(thread); +} + +packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + return (packm_thrinfo_t*) bli_create_thread_info( ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); +} + +void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + bli_setup_thread_info( (thrinfo_t*) thread, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); +} + +void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; +} diff --git a/frame/1m/packm/bli_packm_threading.h b/frame/1m/packm/bli_packm_threading.h new file mode 100644 index 000000000..7b4dc0f22 --- /dev/null +++ b/frame/1m/packm/bli_packm_threading.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +struct packm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on +}; +typedef struct packm_thrinfo_s packm_thrinfo_t; + +#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) + +void bli_packm_thrinfo_free( packm_thrinfo_t* thread ); +packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index c7d85a78a..b344b93d6 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -56,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); void bli_packm_unb_var1( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* thread ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -98,20 +99,22 @@ void bli_packm_unb_var1( obj_t* c, // function pointer. f = ftypes[dt_cp]; - // Invoke the function. - f( strucc, - diagoffc, - diagc, - uploc, - transc, - densify, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p ); + if( thread_am_ochief( thread ) ) { + // Invoke the function. 
+ f( strucc, + diagoffc, + diagc, + uploc, + transc, + densify, + m_p, + n_p, + m_max_p, + n_max_p, + buf_kappa, + buf_c, rs_c, cs_c, + buf_p, rs_p, cs_p ); + } } diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h index 1f4c451bf..25e95994e 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_unb_var1.h @@ -33,7 +33,8 @@ */ void bli_packm_unb_var1( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* thread ); #undef GENTPROT diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index f94bbb423..70a520403 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -49,7 +49,8 @@ static FUNCPTR_T vars[2][3] = void bli_unpackm_int( obj_t* p, obj_t* a, - unpackm_t* cntl ) + unpackm_t* cntl, + packm_thrinfo_t* thread ) { // The unpackm operation consists of an optional post-process: castm. // (This post-process is analogous to the castm pre-process in packm.) @@ -122,9 +123,12 @@ void bli_unpackm_int( obj_t* p, f = vars[n][i]; // Invoke the variant. - f( p, - &c, - cntl ); + if( thread_am_ochief( thread ) ) { + f( p, + &c, + cntl ); + } + thread_obarrier( thread ); // Now, if necessary, we cast the contents of c to matrix a. 
If casting // was not necessary, then we are done because the call to the unpackm diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 11960817c..89b8489f6 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -34,7 +34,8 @@ void bli_unpackm_int( obj_t* p, obj_t* a, - unpackm_t* cntl ); + unpackm_t* cntl, + packm_thrinfo_t* thread ); /* void bli_unpackm_init_cast( obj_t* p, diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/bli_gemv_blk_var1.c index c4701ea6e..4f95118c2 100644 --- a/frame/2/gemv/bli_gemv_blk_var1.c +++ b/frame/2/gemv/bli_gemv_blk_var1.c @@ -76,7 +76,8 @@ void bli_gemv_blk_var1( obj_t* alpha, // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/bli_gemv_blk_var2.c index a9ea14856..4d6fdba0d 100644 --- a/frame/2/gemv/bli_gemv_blk_var2.c +++ b/frame/2/gemv/bli_gemv_blk_var2.c @@ -81,7 +81,8 @@ void bli_gemv_blk_var2( obj_t* alpha, // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/bli_ger_blk_var1.c index 4d92a941c..77b6ace11 100644 --- a/frame/2/ger/bli_ger_blk_var1.c +++ b/frame/2/ger/bli_ger_blk_var1.c @@ -75,7 +75,8 @@ void bli_ger_blk_var1( obj_t* alpha, // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); @@ -90,7 +91,8 @@ void bli_ger_blk_var1( obj_t* alpha, // Copy/unpack A1 (if A1 was packed). 
bli_unpackm_int( &a1_pack, &a1, - cntl_sub_unpackm_a( cntl ) ); + cntl_sub_unpackm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/bli_ger_blk_var2.c index 3855ca895..6405497aa 100644 --- a/frame/2/ger/bli_ger_blk_var2.c +++ b/frame/2/ger/bli_ger_blk_var2.c @@ -75,7 +75,8 @@ void bli_ger_blk_var2( obj_t* alpha, // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); @@ -90,7 +91,8 @@ void bli_ger_blk_var2( obj_t* alpha, // Copy/unpack A1 (if A1 was packed). bli_unpackm_int( &a1_pack, &a1, - cntl_sub_unpackm_a( cntl ) ); + cntl_sub_unpackm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/bli_hemv_blk_var1.c index 03d942970..5beb1be75 100644 --- a/frame/2/hemv/bli_hemv_blk_var1.c +++ b/frame/2/hemv/bli_hemv_blk_var1.c @@ -106,7 +106,8 @@ void bli_hemv_blk_var1( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/bli_hemv_blk_var2.c index 05211f522..371f53d82 100644 --- a/frame/2/hemv/bli_hemv_blk_var2.c +++ b/frame/2/hemv/bli_hemv_blk_var2.c @@ -109,7 +109,8 @@ void bli_hemv_blk_var2( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/bli_hemv_blk_var3.c index d7d8fbc33..072706300 100644 --- a/frame/2/hemv/bli_hemv_blk_var3.c +++ b/frame/2/hemv/bli_hemv_blk_var3.c @@ -106,7 +106,8 @@ void bli_hemv_blk_var3( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/bli_hemv_blk_var4.c index e8c5e739b..d4fc17324 100644 --- a/frame/2/hemv/bli_hemv_blk_var4.c +++ b/frame/2/hemv/bli_hemv_blk_var4.c @@ -109,7 +109,8 @@ void bli_hemv_blk_var4( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/bli_her_blk_var1.c index 64089bd6b..45fc9c1d4 100644 --- a/frame/2/her/bli_her_blk_var1.c +++ b/frame/2/her/bli_her_blk_var1.c @@ -90,7 +90,8 @@ void bli_her_blk_var1( conj_t conjh, // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); @@ -112,7 +113,8 @@ void bli_her_blk_var1( conj_t conjh, // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/bli_her_blk_var2.c index e88f2a29e..a856269b0 100644 --- a/frame/2/her/bli_her_blk_var2.c +++ b/frame/2/her/bli_her_blk_var2.c @@ -90,7 +90,8 @@ void bli_her_blk_var2( conj_t conjh, // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); @@ -112,7 +113,8 @@ void bli_her_blk_var2( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/bli_her2_blk_var1.c index e8896d026..af15b674f 100644 --- a/frame/2/her2/bli_her2_blk_var1.c +++ b/frame/2/her2/bli_her2_blk_var1.c @@ -101,7 +101,8 @@ void bli_her2_blk_var1( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, @@ -136,7 +137,8 @@ void bli_her2_blk_var1( conj_t conjh, // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/bli_her2_blk_var2.c index ca165872f..d57da2bff 100644 --- a/frame/2/her2/bli_her2_blk_var2.c +++ b/frame/2/her2/bli_her2_blk_var2.c @@ -104,7 +104,8 @@ void bli_her2_blk_var2( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, @@ -139,7 +140,8 @@ void bli_her2_blk_var2( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/bli_her2_blk_var3.c index e1ac9d555..8270f8dff 100644 --- a/frame/2/her2/bli_her2_blk_var3.c +++ b/frame/2/her2/bli_her2_blk_var3.c @@ -104,7 +104,8 @@ void bli_her2_blk_var3( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, @@ -139,7 +140,8 @@ void bli_her2_blk_var3( conj_t conjh, // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/bli_her2_blk_var4.c index d0a8eaf9f..77b750230 100644 --- a/frame/2/her2/bli_her2_blk_var4.c +++ b/frame/2/her2/bli_her2_blk_var4.c @@ -101,7 +101,8 @@ void bli_her2_blk_var4( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, @@ -136,7 +137,8 @@ void bli_her2_blk_var4( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/bli_trmv_l_blk_var1.c index c9260d7a6..5550e9ee9 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var1.c +++ b/frame/2/trmv/bli_trmv_l_blk_var1.c @@ -80,7 +80,8 @@ void bli_trmv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/bli_trmv_l_blk_var2.c index dd6493069..1db28eb11 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var2.c +++ b/frame/2/trmv/bli_trmv_l_blk_var2.c @@ -80,7 +80,8 @@ void bli_trmv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/bli_trmv_u_blk_var1.c index e50293f9d..1e82157af 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var1.c +++ b/frame/2/trmv/bli_trmv_u_blk_var1.c @@ -80,7 +80,8 @@ void bli_trmv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/bli_trmv_u_blk_var2.c index d5c491daf..0c9ea6d0b 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var2.c +++ b/frame/2/trmv/bli_trmv_u_blk_var2.c @@ -80,7 +80,8 @@ void bli_trmv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c b/frame/2/trsv/bli_trsv_l_blk_var1.c index 6ffdd541e..b7b7e382a 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var1.c +++ b/frame/2/trsv/bli_trsv_l_blk_var1.c @@ -85,7 +85,8 @@ void bli_trsv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/bli_trsv_l_blk_var2.c index 9740a9ec0..5e2718cb2 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var2.c +++ b/frame/2/trsv/bli_trsv_l_blk_var2.c @@ -85,7 +85,8 @@ void bli_trsv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/bli_trsv_u_blk_var1.c index 8d65e36fd..6f6c55558 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var1.c +++ b/frame/2/trsv/bli_trsv_u_blk_var1.c @@ -85,7 +85,8 @@ void bli_trsv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/bli_trsv_u_blk_var2.c index 16a167b17..7611e53dc 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var2.c +++ b/frame/2/trsv/bli_trsv_u_blk_var2.c @@ -85,7 +85,8 @@ void bli_trsv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index 5c19af504..c47985a3e 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -50,7 +50,6 @@ #include "bli_gemm4m.h" #include "bli_gemm3m.h" - // // Prototype object-based interface. 
// diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index e425f44fd..8288f3ac9 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -37,45 +37,64 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1, c1_pack; + //The s is for "lives on the stack" + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize objects passed into bli_packm_init for A and C + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B (if instructed). 
- bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range( thread, 0, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the m dimension. - for ( i = 0; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of a (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,38 +102,50 @@ void bli_gemm_blk_var1f( obj_t* a, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + + // Initialize objects for packing A1 and C1. + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_gemm( cntl ) ); + c1_pack, + cntl_sub_gemm( cntl ), + gemm_thread_sub_gemm( thread ) ); // Unpack C1 (if C1 was packed). 
- bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.h b/frame/3/gemm/bli_gemm_blk_var1f.h index 4e5bfcf36..99548ac12 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.h +++ b/frame/3/gemm/bli_gemm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index c9f29ee7b..63c2f5824 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -37,45 +37,63 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -85,36 +103,48 @@ void bli_gemm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). 
- bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_gemm( cntl ) ); + c1_pack, + cntl_sub_gemm( cntl ), + gemm_thread_sub_gemm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var2f.h b/frame/3/gemm/bli_gemm_blk_var2f.h index 01a4c175a..73dea73b8 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.h +++ b/frame/3/gemm/bli_gemm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 97bcd5d87..f1114daaf 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -37,37 +37,50 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t 
c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -85,26 +98,32 @@ void bli_gemm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_gemm( cntl ) ); + c_pack, + cntl_sub_gemm( cntl ), + gemm_thread_sub_gemm( thread) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it @@ -112,17 +131,25 @@ void bli_gemm_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. - if ( i == 0 ) bli_obj_scalar_reset( &c_pack ); + if ( i == 0 ) thread_ibarrier( thread ); + if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( c_pack ); + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var3f.h b/frame/3/gemm/bli_gemm_blk_var3f.h index ba6716215..cdd655c2f 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.h +++ b/frame/3/gemm/bli_gemm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 863177dae..bb868b7a9 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,7 +55,6 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; - void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index d139f6b3c..01b8eaab7 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -74,12 +74,20 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + // Invoke the internal back-end. 
- bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index ad9139547..d2cc838e1 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); static FUNCPTR_T vars[6][3] = { @@ -57,7 +58,8 @@ void bli_gemm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -73,7 +75,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -86,7 +90,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -104,22 +110,24 @@ void bli_gemm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + //if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + // } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Extract the variant number and implementation type. @@ -133,6 +141,7 @@ void bli_gemm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 5181f3253..bfefe30c7 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -37,5 +37,6 @@ void bli_gemm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index b800c0afe..2d5cc7bca 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -45,7 +45,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + gemm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); @@ -54,7 +55,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -117,7 +119,8 @@ void bli_gemm_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -133,7 +136,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + gemm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -213,18 +217,21 @@ void PASTEMAC(ch,varname)( \ bli_auxinfo_set_ps_a( ps_a, aux ); \ bli_auxinfo_set_ps_b( ps_b, aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ -\ - a1 = a_cast; \ - c11 = c1; \ + \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -232,18 +239,21 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -282,13 +292,7 @@ void PASTEMAC(ch,varname)( \ beta_cast, \ c11, rs_c, cs_c ); \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ diff --git a/frame/3/gemm/bli_gemm_ker_var2.h b/frame/3/gemm/bli_gemm_ker_var2.h index 62ebd6041..ca5ac1eff 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.h +++ b/frame/3/gemm/bli_gemm_ker_var2.h @@ -39,7 +39,8 @@ void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); // @@ -57,7 +58,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + gemm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( gemm_ker_var2 ) diff --git a/frame/3/gemm/bli_gemm_ker_var5.c b/frame/3/gemm/bli_gemm_ker_var5.c index 2e4599995..33d245780 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.c +++ b/frame/3/gemm/bli_gemm_ker_var5.c @@ -54,7 +54,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5); void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + gemm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_ker_var5.h b/frame/3/gemm/bli_gemm_ker_var5.h index 6c79226ea..52a237bbc 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.h +++ b/frame/3/gemm/bli_gemm_ker_var5.h @@ -39,7 +39,8 @@ void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + gemm_thrinfo_t* thread ); // diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c new file mode 100644 
index 000000000..3056f019f --- /dev/null +++ b/frame/3/gemm/bli_gemm_threading.c @@ -0,0 +1,203 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include "blis.h"
+#include <assert.h>
+// Copy every argument into the like-named field of an already-allocated node.
+void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
+                                  thread_comm_t* ocomm, dim_t ocomm_id,
+                                  thread_comm_t* icomm, dim_t icomm_id,
+                                  dim_t n_way, dim_t work_id,
+                                  packm_thrinfo_t* opackm,
+                                  packm_thrinfo_t* ipackm,
+                                  gemm_thrinfo_t* sub_gemm )
+{
+    thread->ocomm = ocomm;
+    thread->ocomm_id = ocomm_id;
+    thread->icomm = icomm;
+    thread->icomm_id = icomm_id;
+    thread->n_way = n_way;
+    thread->work_id = work_id;
+    thread->opackm = opackm;
+    thread->ipackm = ipackm;
+    thread->sub_gemm = sub_gemm;
+}
+// Initialise `thread` for one-thread execution: BLIS_SINGLE_COMM for both communicators, ids 0, n_way 1.
+void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread )
+{
+    thread->ocomm = &BLIS_SINGLE_COMM;
+    thread->ocomm_id = 0;
+    thread->icomm = &BLIS_SINGLE_COMM;
+    thread->icomm_id = 0;
+    thread->n_way = 1;
+    thread->work_id = 0;
+    thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->sub_gemm = thread;   // self-referential: the node is its own sub-node
+}
+// Allocate a node with bli_malloc() and fill it in; bli_gemm_thrinfo_free() releases it with bli_free().
+gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
+                                              thread_comm_t* icomm, dim_t icomm_id,
+                                              dim_t n_way, dim_t work_id,
+                                              packm_thrinfo_t* opackm,
+                                              packm_thrinfo_t* ipackm,
+                                              gemm_thrinfo_t* sub_gemm )
+{
+    gemm_thrinfo_t* thread = ( gemm_thrinfo_t* ) bli_malloc( sizeof( gemm_thrinfo_t ) );
+    bli_setup_gemm_thrinfo_node( thread, ocomm, ocomm_id,
+                                 icomm, icomm_id,
+                                 n_way, work_id,
+                                 opackm,
+                                 ipackm,
+                                 sub_gemm );
+    return thread;
+}
+// Release a node and, recursively, its packm infos and sub-node; NULL and &BLIS_GEMM_SINGLE_THREADED are ignored.
+void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread)
+{
+    if( thread == NULL || thread == &BLIS_GEMM_SINGLE_THREADED ) return;
+
+    // Free Communicators (the outer one only, and only by the thread that is its chief)
+    if( thread_am_ochief( thread ) )
+        bli_free_communicator( thread->ocomm );
+
+    // Free Sub Thrinfos
+    bli_packm_thrinfo_free( thread->opackm );
+    bli_packm_thrinfo_free( thread->ipackm );
+    bli_gemm_thrinfo_free( thread->sub_gemm );
+    bli_free( thread );
+
+    return;
+}
+void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num )   // free `num` paths, then the table itself
+{
+    for( dim_t i = 0; i < num; i++)
+        bli_gemm_thrinfo_free( threads[i] );
+    bli_free( threads );
+}
+// Build one five-node path (jc -> kc -> ic -> jr -> ir) per thread; returns a bli_malloc()ed table indexed by global thread id.
+gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( void )
+{
+    // Ways of parallelism per loop are read from the environment; kc_way is pinned to 1 (its lookup is commented out).
+#ifdef BLIS_ENABLE_MULTITHREADING
+    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
+//    dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
+    dim_t kc_way = 1;
+    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
+    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
+    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
+#else
+    dim_t jc_way = 1;
+    dim_t kc_way = 1;
+    dim_t ic_way = 1;
+    dim_t jr_way = 1;
+    dim_t ir_way = 1;
+#endif
+
+
+    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
+    assert( global_num_threads != 0 );   // NOTE(review): compiled out under NDEBUG, leaving a zero way count unchecked
+    // *_nt: number of threads inside one group at that level (product of the inner ways).
+    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
+    dim_t kc_nt = ic_way * jr_way * ir_way;
+    dim_t ic_nt = jr_way * ir_way;
+    dim_t jr_nt = ir_way;
+    dim_t ir_nt = 1;
+
+    // The table is released with bli_free() in bli_gemm_thrinfo_free_paths(), so it must come from bli_malloc().
+    gemm_thrinfo_t** paths = (gemm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( gemm_thrinfo_t* ) );
+
+    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
+    for( dim_t a = 0; a < jc_way; a++ )
+    {
+        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
+        for( dim_t b = 0; b < kc_way; b++ )
+        {
+            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
+            for( dim_t c = 0; c < ic_way; c++ )
+            {
+                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
+                for( dim_t d = 0; d < jr_way; d++ )
+                {
+                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
+                    for( dim_t e = 0; e < ir_way; e++)
+                    {
+                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );   // NOTE(review): only ever stored as a leaf icomm, and bli_gemm_thrinfo_free() frees ocomm only -- confirm it is released elsewhere
+                        dim_t ir_comm_id = 0;
+                        dim_t jr_comm_id = e*ir_nt + ir_comm_id;   // this thread's id within each enclosing communicator, innermost first
+                        dim_t ic_comm_id = d*jr_nt + jr_comm_id;
+                        dim_t kc_comm_id = c*ic_nt + ic_comm_id;
+                        dim_t jc_comm_id = b*kc_nt + kc_comm_id;
+                        dim_t global_comm_id = a*jc_nt + jc_comm_id;
+                        // Build the chain innermost-first so each node can point at its sub-node.
+                        gemm_thrinfo_t* ir_info = bli_create_gemm_thrinfo_node( jr_comm, jr_comm_id,
+                                                                                ir_comm, ir_comm_id,
+                                                                                ir_way, e,
+                                                                                NULL, NULL, NULL);
+
+                        gemm_thrinfo_t* jr_info = bli_create_gemm_thrinfo_node( ic_comm, ic_comm_id,
+                                                                                jr_comm, jr_comm_id,
+                                                                                jr_way, d,
+                                                                                NULL, NULL, ir_info);
+
+                        packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
+                                                                               ic_comm, ic_comm_id,
+                                                                               kc_nt, kc_comm_id );
+
+                        packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
+                                                                               jr_comm, jr_comm_id,
+                                                                               ic_nt, ic_comm_id );
+
+                        gemm_thrinfo_t* ic_info = bli_create_gemm_thrinfo_node( kc_comm, kc_comm_id,
+                                                                                ic_comm, ic_comm_id,
+                                                                                ic_way, c,
+                                                                                packb, packa, jr_info);
+
+                        gemm_thrinfo_t* kc_info = bli_create_gemm_thrinfo_node( jc_comm, jc_comm_id,
+                                                                                kc_comm, kc_comm_id,
+                                                                                kc_way, b,
+                                                                                NULL, NULL, ic_info);
+
+                        gemm_thrinfo_t* jc_info = bli_create_gemm_thrinfo_node( global_comm, global_comm_id,
+                                                                                jc_comm, jc_comm_id,
+                                                                                jc_way, a,
+                                                                                NULL, NULL, kc_info);
+                        paths[global_comm_id] = jc_info;
+                    }
+                }
+            }
+        }
+    }
+    return paths;
+}
diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h new file mode 100644 index 000000000..b789ba09b --- /dev/null +++ b/frame/3/gemm/bli_gemm_threading.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Per-thread description of one loop level of gemm; nodes are chained through sub_gemm, outermost loop first.
+struct gemm_thrinfo_s //implements thrinfo_t
+{
+    thread_comm_t*      ocomm;       //Outer communicator: spans all n_way work groups of this loop level
+    dim_t               ocomm_id;    //Our thread id within ocomm
+    thread_comm_t*      icomm;       //Inner communicator: only the threads that share our work_id at this level
+    dim_t               icomm_id;    //Our thread id within icomm
+
+    dim_t               n_way;       //Number of distinct caucuses used to parallelize the loop
+    dim_t               work_id;     //What we're working on
+
+    packm_thrinfo_t*    opackm;      //packm thread infos; in the paths built by bli_create_gemm_thrinfo_paths()
+    packm_thrinfo_t*    ipackm;      //they are non-NULL only on the ic-level node
+    struct gemm_thrinfo_s*    sub_gemm;   //Node for the next loop level (NULL on the innermost node of a built path)
+};
+typedef struct gemm_thrinfo_s gemm_thrinfo_t;
+// Field accessors; arguments and expansions are parenthesised so any pointer expression may be passed.
+#define gemm_thread_sub_gemm( thread )  ( (thread)->sub_gemm )
+#define gemm_thread_sub_opackm( thread )  ( (thread)->opackm )
+#define gemm_thread_sub_ipackm( thread )  ( (thread)->ipackm )
+
+// For use in gemm micro-kernel: address of the micro-panel `step * n_way` elements past a1 / b1
+#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
+#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
+
+gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( void );
+void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t n_threads );
+
+void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
+                                  thread_comm_t* ocomm, dim_t ocomm_id,
+                                  thread_comm_t* icomm, dim_t icomm_id,
+                                  dim_t n_way, dim_t work_id,
+                                  packm_thrinfo_t* opackm,
+                                  packm_thrinfo_t* ipackm,
+                                  gemm_thrinfo_t* sub_gemm );
+
+gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
+                                              thread_comm_t* icomm, dim_t icomm_id,
+                                              dim_t n_way, dim_t work_id,
+                                              packm_thrinfo_t* opackm,
+                                              packm_thrinfo_t* ipackm,
+                                              gemm_thrinfo_t* sub_gemm );
+
+void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread );
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index eaaee7b2c..7848e1117 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -80,12 +80,20 @@ void bli_hemm_front( side_t side, bli_obj_swap( a_local, b_local ); } - // Invoke the internal back-end. - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index bfcd076cf..01afc70dc 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -109,20 +109,34 @@ void bli_her2k_front( obj_t* alpha, &c_local, cntl ); #else - // Invoke herk twice, using beta only the first time. - bli_herk_int( alpha, - &a_local, - &bh_local, - beta, - &c_local, - cntl ); - bli_herk_int( &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - cntl ); + // Invoke herk twice, using beta only the first time. + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end.
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + alpha, + &a_local, + &bh_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + &alpha_conj, + &b_local, + &ah_local, + &BLIS_ONE, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos, n_threads ); + #endif } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index f3c7c31bc..fbee3a750 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -37,42 +37,60 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah_pack; - obj_t c1, c1_pack; + obj_t ah_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack; + obj_t* c1_pack; + obj_t* ah_pack; dim_t i; dim_t b_alg; dim_t m_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A'. + bli_obj_init_pack( &ah_pack_s ); + bli_packm_init( ah, &ah_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + ah_pack = thread_obroadcast( thread, &ah_pack_s ); + + // Initialize pack objects that are passed into packm_init() for A and C. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A' (if instructed). 
+ bli_packm_int( ah, ah_pack, + cntl_sub_packm_b( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A'. - bli_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack A' (if instructed). - bli_packm_int( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, 0, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. - for ( i = 0; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -82,36 +100,47 @@ void bli_herk_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + herk_thread_sub_ipackm( thread ) ); // Perform herk subproblem. 
bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah_pack, + a1_pack, + ah_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_herk( cntl ) ); + c1_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( ah_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var1f.h b/frame/3/herk/bli_herk_blk_var1f.h index dfcae5c99..2a1b85f6e 100644 --- a/frame/3/herk/bli_herk_blk_var1f.h +++ b/frame/3/herk/bli_herk_blk_var1f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 4b4f77df3..f8fc666ba 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -37,49 +37,68 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a_pack, aS_pack; - obj_t ah1, ah1_pack; - obj_t c1; - obj_t c1S, c1S_pack; + obj_t a_pack_s; + obj_t ah1_pack_s, c1S_pack_s; + + obj_t ah1, c1, c1S; + obj_t aS_pack; + obj_t* a_pack; + obj_t* ah1_pack; + obj_t* c1S_pack; dim_t i; dim_t b_alg; dim_t n_trans; subpart_t stored_part; - // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c1S_pack ); - // The upper and lower variants are identical, except for which // merged subpartition is acquired in the loop body. if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B; else stored_part = BLIS_SUBPART1T; - // Query dimension in partitioning direction. - n_trans = bli_obj_width_after_trans( *c ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize pack objects for C and A' that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &ah1_pack_s ); + bli_obj_init_pack( &c1S_pack_s ); + } + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + c1S_pack = thread_ibroadcast( thread, &c1S_pack_s ); // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + herk_thread_sub_opackm( thread ) ); + + // Query dimension in partitioning direction. + n_trans = bli_obj_width_after_trans( *c ); + dim_t start, end; + + // Needs to be replaced with a weighted range because triangle + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. 
- b_alg = bli_determine_blocksize_f( i, n_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1' and C1. @@ -90,42 +109,53 @@ void bli_herk_blk_var2f( obj_t* a, // Partition off the stored region of C1 and the corresponding region // of A_pack. - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &c1, &c1S ); - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &a_pack, &aS_pack ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, &c1, &c1S ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, a_pack, &aS_pack ); // Initialize objects for packing A1' and C1. - bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1S, &c1S_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1S, c1S_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ) ; // Pack A1' (if instructed). - bli_packm_int( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ), + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1S, &c1S_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1S, c1S_pack, + cntl_sub_packm_c( cntl ), + herk_thread_sub_ipackm( thread ) ) ; // Perform herk subproblem. bli_herk_int( &BLIS_ONE, &aS_pack, - &ah1_pack, + ah1_pack, &BLIS_ONE, - &c1S_pack, - cntl_sub_herk( cntl ) ); + c1S_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1S_pack, &c1S, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c1S_pack, &c1S, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c1S_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( ah1_pack ); + bli_obj_release_pack( c1S_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var2f.h b/frame/3/herk/bli_herk_blk_var2f.h index 4932535d1..1d405f214 100644 --- a/frame/3/herk/bli_herk_blk_var2f.h +++ b/frame/3/herk/bli_herk_blk_var2f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 436378e80..943109156 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -37,37 +37,50 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah1, ah1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, ah1_pack_s; + + obj_t a1, ah1; + obj_t* a1_pack = NULL; + obj_t* ah1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing C. + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &ah1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + + // Pack C (if instructed). 
+ bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -82,44 +95,59 @@ void bli_herk_blk_var3f( obj_t* a, i, b_alg, ah, &ah1 ); // Initialize objects for packing A1 and A1'. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + herk_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ), + herk_thread_sub_ipackm( thread ) ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah1_pack, + a1_pack, + ah1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_herk( cntl ) ); + c_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c_pack is a local obj_t, we can simply overwrite the + // internal beta scalar with BLIS_ONE once it has been used in the + // first iteration. 
+ if ( i == 0 ) thread_ibarrier( thread ); + if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - if ( i == 0 ) bli_obj_scalar_reset( &c_pack ); } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( ah1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var3f.h b/frame/3/herk/bli_herk_blk_var3f.h index b77ebc33f..22093d421 100644 --- a/frame/3/herk/bli_herk_blk_var3f.h +++ b/frame/3/herk/bli_herk_blk_var3f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 456cf84a8..6fb092460 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,12 +77,20 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - // Invoke the internal back-end. 
- bli_herk_int( alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntl ); + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + alpha, + &a_local, + &ah_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index 725ba80fb..bdd869093 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); static FUNCPTR_T vars[2][4][3] = { @@ -66,7 +67,8 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { obj_t a_local; obj_t ah_local; @@ -83,7 +85,9 @@ void bli_herk_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -105,22 +109,22 @@ void bli_herk_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set a bool based on the uplo field of C's root object. @@ -138,6 +142,7 @@ void bli_herk_int( obj_t* alpha, f( &a_local, &ah_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/bli_herk_int.h index 1b1973b3e..a3fa6343d 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/bli_herk_int.h @@ -37,5 +37,6 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 8afcf5124..464e54588 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void bli_herk_l_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_l_ker_var2.h b/frame/3/herk/bli_herk_l_ker_var2.h index 5dd906db9..09f1c7b31 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.h +++ b/frame/3/herk/bli_herk_l_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c new file mode 100644 index 000000000..b0def6f3f --- /dev/null +++ b/frame/3/herk/bli_herk_threading.c @@ -0,0 +1,203 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <assert.h>
+// Copy every argument into the like-named field of an already-allocated node.
+void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
+                                  thread_comm_t* ocomm, dim_t ocomm_id,
+                                  thread_comm_t* icomm, dim_t icomm_id,
+                                  dim_t n_way, dim_t work_id,
+                                  packm_thrinfo_t* opackm,
+                                  packm_thrinfo_t* ipackm,
+                                  herk_thrinfo_t* sub_herk )
+{
+    thread->ocomm = ocomm;
+    thread->ocomm_id = ocomm_id;
+    thread->icomm = icomm;
+    thread->icomm_id = icomm_id;
+    thread->n_way = n_way;
+    thread->work_id = work_id;
+    thread->opackm = opackm;
+    thread->ipackm = ipackm;
+    thread->sub_herk = sub_herk;
+}
+// Initialise `thread` for one-thread execution: BLIS_SINGLE_COMM for both communicators, ids 0, n_way 1.
+void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread )
+{
+    thread->ocomm = &BLIS_SINGLE_COMM;
+    thread->ocomm_id = 0;
+    thread->icomm = &BLIS_SINGLE_COMM;
+    thread->icomm_id = 0;
+    thread->n_way = 1;
+    thread->work_id = 0;
+    thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->sub_herk = thread;   // self-referential: the node is its own sub-node
+}
+// Allocate a node with bli_malloc() and fill it in; bli_herk_thrinfo_free() releases it with bli_free().
+herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
+                                              thread_comm_t* icomm, dim_t icomm_id,
+                                              dim_t n_way, dim_t work_id,
+                                              packm_thrinfo_t* opackm,
+                                              packm_thrinfo_t* ipackm,
+                                              herk_thrinfo_t* sub_herk )
+{
+    herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc( sizeof( herk_thrinfo_t ) );
+    bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id,
+                                 icomm, icomm_id,
+                                 n_way, work_id,
+                                 opackm,
+                                 ipackm,
+                                 sub_herk );
+    return thread;
+}
+// Release a node and, recursively, its packm infos and sub-node; a NULL node is ignored.
+void bli_herk_thrinfo_free( herk_thrinfo_t* thread)
+{
+    if( thread == NULL ) return;   // NOTE(review): unlike bli_gemm_thrinfo_free() there is no single-threaded sentinel guard; a node with sub_herk == thread would recurse forever -- confirm none is passed
+
+    // Free Communicators (the outer one only, and only by the thread that is its chief)
+    if( thread_am_ochief( thread ) )
+        bli_free_communicator( thread->ocomm );
+
+    // Free Sub Thrinfos
+    bli_packm_thrinfo_free( thread->opackm );
+    bli_packm_thrinfo_free( thread->ipackm );
+    bli_herk_thrinfo_free( thread->sub_herk );
+    bli_free( thread );
+
+    return;
+}
+void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num )   // free `num` paths, then the table itself
+{
+    for( dim_t i = 0; i < num; i++)
+        bli_herk_thrinfo_free( threads[i] );
+    bli_free( threads );
+}
+// Build one five-node path (jc -> kc -> ic -> jr -> ir) per thread; returns a bli_malloc()ed table indexed by global thread id.
+herk_thrinfo_t** bli_create_herk_thrinfo_paths( void )
+{
+    // Ways of parallelism per loop are read from the environment; kc_way is pinned to 1 (its lookup is commented out).
+#ifdef BLIS_ENABLE_MULTITHREADING
+    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
+//    dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
+    dim_t kc_way = 1;
+    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
+    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
+    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
+#else
+    dim_t jc_way = 1;
+    dim_t kc_way = 1;
+    dim_t ic_way = 1;
+    dim_t jr_way = 1;
+    dim_t ir_way = 1;
+#endif
+
+    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
+    assert( global_num_threads != 0 );   // NOTE(review): compiled out under NDEBUG, leaving a zero way count unchecked
+    // *_nt: number of threads inside one group at that level (product of the inner ways).
+    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
+    dim_t kc_nt = ic_way * jr_way * ir_way;
+    dim_t ic_nt = jr_way * ir_way;
+    dim_t jr_nt = ir_way;
+    dim_t ir_nt = 1;
+
+    // The table is released with bli_free() in bli_herk_thrinfo_free_paths(), so it must come from bli_malloc().
+    herk_thrinfo_t** paths = (herk_thrinfo_t**) bli_malloc( global_num_threads * sizeof( herk_thrinfo_t* ) );
+
+    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
+    for( dim_t a = 0; a < jc_way; a++ )
+    {
+        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
+        for( dim_t b = 0; b < kc_way; b++ )
+        {
+            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
+            for( dim_t c = 0; c < ic_way; c++ )
+            {
+                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
+                for( dim_t d = 0; d < jr_way; d++ )
+                {
+                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
+                    for( dim_t e = 0; e < ir_way; e++)
+                    {
+                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );   // NOTE(review): only ever stored as a leaf icomm, and bli_herk_thrinfo_free() frees ocomm only -- confirm it is released elsewhere
+                        dim_t ir_comm_id = 0;
+                        dim_t jr_comm_id = e*ir_nt + ir_comm_id;   // this thread's id within each enclosing communicator, innermost first
+                        dim_t ic_comm_id = d*jr_nt + jr_comm_id;
+                        dim_t kc_comm_id = c*ic_nt + ic_comm_id;
+                        dim_t jc_comm_id = b*kc_nt + kc_comm_id;
+                        dim_t global_comm_id = a*jc_nt + jc_comm_id;
+                        // Build the chain innermost-first so each node can point at its sub-node.
+                        herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
+                                                                                ir_comm, ir_comm_id,
+                                                                                ir_way, e,
+                                                                                NULL, NULL, NULL);
+
+                        herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
+                                                                                jr_comm, jr_comm_id,
+                                                                                jr_way, d,
+                                                                                NULL, NULL, ir_info);
+
+                        packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
+                                                                               ic_comm, ic_comm_id,
+                                                                               kc_nt, kc_comm_id );
+
+                        packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
+                                                                               jr_comm, jr_comm_id,
+                                                                               ic_nt, ic_comm_id );
+
+                        herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
+                                                                                ic_comm, ic_comm_id,
+                                                                                ic_way, c,
+                                                                                packb, packa, jr_info);
+
+                        herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
+                                                                                kc_comm, kc_comm_id,
+                                                                                kc_way, b,
+                                                                                NULL, NULL, ic_info);
+
+                        herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
+                                                                                jc_comm, jc_comm_id,
+                                                                                jc_way, a,
+                                                                                NULL, NULL, kc_info);
+
+                        paths[global_comm_id] = jc_info;
+                    }
+                }
+            }
+        }
+    }
+    return paths;
+}
diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h new file mode 100644 index 000000000..33a04ff8b --- /dev/null +++ b/frame/3/herk/bli_herk_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance
BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +struct herk_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct herk_thrinfo_s* sub_herk; +}; +typedef struct herk_thrinfo_s herk_thrinfo_t; + +#define herk_thread_sub_herk( thread ) thread->sub_herk +#define herk_thread_sub_opackm( thread ) thread->opackm +#define herk_thread_sub_ipackm( thread ) thread->ipackm + +// For use in herk micro-kernel +#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) + + +herk_thrinfo_t** bli_create_herk_thrinfo_paths( ); +void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads ); + +void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 9c3d6cf06..694f8a211 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, 
inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void bli_herk_u_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). 
*/ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_u_ker_var2.h b/frame/3/herk/bli_herk_u_ker_var2.h index c6555bc27..481947b8e 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.h +++ b/frame/3/herk/bli_herk_u_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 303154caa..796ad5196 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -79,12 +79,20 @@ void bli_symm_front( side_t side, bli_obj_swap( a_local, b_local ); } - // Invoke the internal back-end. - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 4fa89654b..eceaf1913 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -93,19 +93,31 @@ void bli_syr2k_front( obj_t* alpha, cntl ); #else // Invoke herk twice, using beta only the first time. - bli_herk_int( alpha, - &a_local, - &bt_local, - beta, - &c_local, - cntl ); + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); - bli_herk_int( alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - cntl ); + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + alpha, + &a_local, + &bt_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + alpha, + &b_local, + &at_local, + &BLIS_ONE, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos, n_threads ); #endif } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index cc2f8d15a..977a91cd8 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -72,13 +72,21 @@ void bli_syrk_front( obj_t* alpha, { bli_obj_induce_trans( c_local ); } + + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_herk_int( alpha, - &a_local, - &at_local, - beta, - &c_local, - cntl ); + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_herk_int, + alpha, + &a_local, + &at_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index a21ff4876..c6cd75421 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -37,21 +37,48 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1, c1_pack; + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. 
m_trans = bli_obj_length_after_trans( *a ); @@ -66,24 +93,16 @@ void bli_trmm_blk_var1f( obj_t* a, m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) + bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -93,36 +112,47 @@ void bli_trmm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); - + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); + // Perform trmm subproblem. 
bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.h b/frame/3/trmm/bli_trmm_blk_var1f.h index c9fc004f7..63994a9a6 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.h +++ b/frame/3/trmm/bli_trmm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 9ff3f0af7..64b33f310 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -37,42 +37,60 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, n_trans, b, + b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -82,36 +100,47 @@ void bli_trmm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2b.h b/frame/3/trmm/bli_trmm_blk_var2b.h index e8d54ecdb..afb9f9903 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.h +++ b/frame/3/trmm/bli_trmm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 35a665ab6..8adaf2b57 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -37,42 +37,60 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -82,36 +100,47 @@ void bli_trmm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). 
- bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2f.h b/frame/3/trmm/bli_trmm_blk_var2f.h index 148bbd234..8c47d55b8 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.h +++ b/frame/3/trmm/bli_trmm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 91921965c..6a1191936 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -37,37 +37,50 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into 
packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -82,36 +95,49 @@ void bli_trmm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). 
- bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ochief( thread ) ){ + bli_obj_release_pack( c_pack ); + } + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3b.h b/frame/3/trmm/bli_trmm_blk_var3b.h index bcd4c8c4b..e3a5bfbb3 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.h +++ b/frame/3/trmm/bli_trmm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 2f3d6fd46..67a4aa880 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -37,37 +37,50 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; 
dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -82,36 +95,49 @@ void bli_trmm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ochief( thread ) ){ + bli_obj_release_pack( c_pack ); + } + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3f.h b/frame/3/trmm/bli_trmm_blk_var3f.h index 4be2c7b3c..6f9338cbb 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.h +++ b/frame/3/trmm/bli_trmm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1911ba3be..dea8b7771 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -125,12 +125,20 @@ void bli_trmm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. 
- bli_trmm_int( alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( !bli_is_left( side ) ); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 70397a0dc..6e65db401 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -105,7 +107,9 @@ void bli_trmm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -127,22 +131,22 @@ void bli_trmm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -173,6 +177,7 @@ void bli_trmm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/bli_trmm_int.h index 18c2d0da0..70d8b551e 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/bli_trmm_int.h @@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 927357105..b5950a603 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -271,9 +275,12 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ - { \ + for ( j = 0; j < n_iter; ++j ) { \ +\ + if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ +\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ @@ -308,121 +315,124 @@ void PASTEMAC(ch,varname)( \ off_a1011 = 0; \ k_a1011 = diagoffa_i + MR; \ \ - b1_i = b1 + off_a1011 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * ss_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ -\ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ -\ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + b1_i = b1 + off_a1011 * PACKNR; \ \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ + \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ + \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ + \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ + \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux ); \ + \ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += k_a1011 * ss_a; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ \ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ \ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. 
*/ \ - bli_auxinfo_set_ps_a( rstep_a, aux ); \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( rstep_a, aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux ); \ \ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += rstep_a; \ } \ -\ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.h b/frame/3/trmm/bli_trmm_ll_ker_var2.h index eb9cb1cc5..9710adc7c 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 867809da0..e4568c70c 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ @@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ @@ -315,14 +321,15 @@ void PASTEMAC(ch,varname)( \ off_a1112 = diagoffa_i; \ k_a1112 = k - off_a1112; \ \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1112 * ss_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -369,19 +376,20 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += k_a1112 * ss_a; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.h b/frame/3/trmm/bli_trmm_lu_ker_var2.h index 3ba1f0ca7..508612a90 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.h +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae4b4b1d2..133c0d8ed 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + 
void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \ dim_t off_b1121; \ dim_t i, j; \ inc_t rstep_a; \ - inc_t cstep_b; \ + /*inc_t cstep_b; */\ inc_t rstep_c, cstep_c; \ inc_t ss_b; \ auxinfo_t aux; \ @@ -267,7 +271,7 @@ void PASTEMAC(ch,varname)( \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ - cstep_b = ps_b; \ + /*cstep_b = ps_b; */\ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ @@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b1121 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b1121 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b1121 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.h b/frame/3/trmm/bli_trmm_rl_ker_var2.h index 3059aaaa9..d1e998bf6 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.h +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index d6498f180..cb5ef580f 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, buf_b, rs_b, 
pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \ dim_t off_b0111; \ dim_t i, j; \ inc_t rstep_a; \ - inc_t cstep_b; \ + /*inc_t cstep_b; */\ inc_t rstep_c, cstep_c; \ inc_t ss_b; \ auxinfo_t aux; \ @@ -268,7 +272,7 @@ void PASTEMAC(ch,varname)( \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ - cstep_b = ps_b; \ + /*cstep_b = ps_b; */\ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ @@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b0111 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b0111 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b0111 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.h b/frame/3/trmm/bli_trmm_ru_ker_var2.h index 93c22402f..cb4a7b937 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c new file mode 100644 index 000000000..03dca3a10 --- /dev/null +++ b/frame/3/trmm/bli_trmm_threading.c @@ -0,0 +1,207 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_trmm = sub_trmm; +} + +void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_trmm = thread; +} + +trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ) +{ + trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc( sizeof( trmm_thrinfo_t ) ); + bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_trmm ); + return thread; +} + +void bli_trmm_thrinfo_free( trmm_thrinfo_t* thread) +{ + if( thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_trmm_thrinfo_free( thread->sub_trmm ); + bli_free( thread ); + + return; +} + +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_trmm_thrinfo_free( threads[i] ); + bli_free( threads ); +} + +trmm_thrinfo_t** 
bli_create_trmm_thrinfo_paths( bool_t jc_dependency ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING +// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t kc_way = 1; + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + + if( jc_dependency ){ + jr_way *= jc_way; + jc_way = 1; + } +#else + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; +#endif + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + trmm_thrinfo_t** paths = (trmm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) ); // released with bli_free() in bli_trmm_thrinfo_free_paths() + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + trmm_thrinfo_t* jr_info = bli_create_trmm_thrinfo_node(
ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; + } + } + } + } + } + return paths; +} diff --git a/frame/3/trmm/bli_trmm_threading.h b/frame/3/trmm/bli_trmm_threading.h new file mode 100644 index 000000000..dadc65912 --- /dev/null +++ b/frame/3/trmm/bli_trmm_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct trmm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //Outer communicator: spans all n_way groups of threads that split this loop + dim_t ocomm_id; //Our thread id within ocomm + thread_comm_t* icomm; //Inner communicator: the threads sharing the same work (work_id) at this level + dim_t icomm_id; //Our thread id within icomm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct trmm_thrinfo_s* sub_trmm; +}; +typedef struct trmm_thrinfo_s trmm_thrinfo_t; + +#define trmm_thread_sub_trmm( thread ) (thread)->sub_trmm +#define trmm_thread_sub_opackm( thread ) (thread)->opackm +#define trmm_thread_sub_ipackm( thread ) (thread)->ipackm + +#define trmm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trmm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trmm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trmm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way ==
(thread)->work_id % (thread)->n_way ) + +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ); +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads ); + +void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 1d4a68918..16c41154d 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,12 +127,20 @@ void bli_trmm3_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trmm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end.
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 82158b707..d2037c202 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing B. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_obroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -60,21 +79,18 @@ void bli_trsm_blk_var1b( obj_t* a, // A begins. if ( bli_obj_is_upper( *a ) ) offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) - - bli_obj_width_after_trans( *a ); + bli_obj_width_after_trans( *a ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). 
- bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, m_trans, a, + b_alg = bli_determine_blocksize_b( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,28 +99,34 @@ void bli_trsm_blk_var1b( obj_t* a, bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); - //if ( bli_obj_is_zeros( a1 ) ) continue; - // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( a1_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1b.h b/frame/3/trsm/bli_trsm_blk_var1b.h index 4ced0fc92..99585b947 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.h +++ b/frame/3/trsm/bli_trsm_blk_var1b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index faa49d25f..7072d0438 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing A1 (shared within the inner communicator). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. 
m_trans = bli_obj_length_after_trans( *a ); @@ -61,19 +80,16 @@ void bli_trsm_blk_var1f( obj_t* a, if ( bli_obj_is_lower( *a ) ) offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,25 +99,33 @@ void bli_trsm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( a1_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1f.h b/frame/3/trsm/bli_trsm_blk_var1f.h index c815c03ff..48384c369 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.h +++ b/frame/3/trsm/bli_trsm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 970fc9307..2ee269cee 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -37,39 +37,58 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, n_trans, b, @@ -82,36 +101,47 @@ void bli_trsm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2b.h b/frame/3/trsm/bli_trsm_blk_var2b.h index fb352ce39..de4a8f899 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.h +++ b/frame/3/trsm/bli_trsm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index d1fe788da..41ccc668f 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -37,39 +37,58 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). 
+ if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, n_trans, b, @@ -82,36 +101,47 @@ void bli_trsm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). 
- bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2f.h b/frame/3/trsm/bli_trsm_blk_var2f.h index 44eb38460..ade7f0bf4 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.h +++ b/frame/3/trsm/bli_trsm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index c699f110b..dd6b2c0c7 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -37,37 +37,51 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + 
obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -82,43 +96,59 @@ void bli_trsm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 ) thread_ibarrier( thread ); + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3b.h b/frame/3/trsm/bli_trsm_blk_var3b.h index d8f6c8dc6..a1779dc67 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.h +++ b/frame/3/trsm/bli_trsm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 17aaa7204..466fd4461 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -37,37 +37,51 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -82,43 +96,59 @@ void bli_trsm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 ) thread_ibarrier( thread ); + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3f.h b/frame/3/trsm/bli_trsm_blk_var3f.h index 8546b0ba5..013d70bc1 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.h +++ b/frame/3/trsm/bli_trsm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 1dd67ece5..2c42c24f9 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -125,12 +125,20 @@ void bli_trsm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trsm_int( alpha, - &a_local, - &b_local, - alpha, - &c_local, - cntl ); + trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trsm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index b88659e2d..4377b97ce 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -105,7 +107,9 @@ void bli_trsm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -127,14 +131,17 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -150,7 +157,8 @@ void bli_trsm_int( obj_t* alpha, // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) @@ -164,10 +172,13 @@ void bli_trsm_int( obj_t* alpha, // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &a_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &a_local ); } } + thread_obarrier( thread ); + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); @@ -179,6 +190,7 @@ void bli_trsm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index 504f7928c..62a937b3c 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -37,4 +37,5 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index bb0ed34db..0d31f656b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t 
cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. */ \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.h b/frame/3/trsm/bli_trsm_ll_ker_var2.h index 59e8e576b..d13ab6f23 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index d86a87ca0..6d0efe5e8 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.h b/frame/3/trsm/bli_trsm_lu_ker_var2.h index 50b18cf79..c26d0081a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.h +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 5d0288c40..3bc951bd5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.h b/frame/3/trsm/bli_trsm_rl_ker_var2.h index a0605a7b7..8cc3c5fed 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.h +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 9bac5c946..6711ba423 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.h b/frame/3/trsm/bli_trsm_ru_ker_var2.h index ebb24b81f..c07b215af 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c new file mode 100644 index 000000000..8d62a737b --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.c @@ -0,0 +1,199 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_trsm = sub_trsm; +} + +void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_trsm = thread; +} + +trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ + trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc( sizeof( trsm_thrinfo_t ) ); + bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_trsm ); + return thread; +} + +void bli_trsm_thrinfo_free( trsm_thrinfo_t* thread) +{ + if( 
thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_trsm_thrinfo_free( thread->sub_trsm ); + bli_free( thread ); + + return; +} +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_trsm_thrinfo_free( threads[i] ); + bli_free( threads ); +} + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) +{ + /* + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + */ + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + trsm_thrinfo_t** paths = (trsm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( trsm_thrinfo_t* ) ); // bli_malloc: released with bli_free() in bli_trsm_thrinfo_free_paths() + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t 
ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; + } + } + } + } + } + return paths; +} diff --git a/frame/3/trsm/bli_trsm_threading.h b/frame/3/trsm/bli_trsm_threading.h new file mode 100644 index 000000000..ad841331e --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +struct trsm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct trsm_thrinfo_s* sub_trsm; +}; +typedef struct trsm_thrinfo_s trsm_thrinfo_t; + +#define trsm_thread_sub_trsm( thread ) thread->sub_trsm +#define trsm_thread_sub_opackm( thread ) thread->opackm +#define trsm_thread_sub_ipackm( thread ) thread->ipackm + +#define trsm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trsm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ); +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** info, dim_t n_threads ); + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ); + +trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ); + +void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread ); diff --git a/frame/base/bli_init.c 
b/frame/base/bli_init.c index 752f23279..80eadd8e2 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -46,7 +46,10 @@ obj_t BLIS_MINUS_ONE_HALF; obj_t BLIS_MINUS_ONE; obj_t BLIS_MINUS_TWO; - +packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; +gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; +thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) { @@ -59,6 +62,11 @@ void bli_init( void ) bli_error_msgs_init(); bli_mem_init(); + + bli_setup_communicator( &BLIS_SINGLE_COMM, 1 ); + bli_setup_packm_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED ); + bli_setup_gemm_single_threaded_info( &BLIS_GEMM_SINGLE_THREADED ); + bli_setup_herk_single_threaded_info( &BLIS_HERK_SINGLE_THREADED ); } void bli_finalize( void ) diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 01f8b5eba..06688cacf 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -127,7 +127,10 @@ void bli_mem_acquire_m( siz_t req_size, // BEGIN CRITICAL SECTION - +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif + { // Query the index of the contiguous memory block that resides at the // "top" of the pool. @@ -145,6 +148,7 @@ void bli_mem_acquire_m( siz_t req_size, // END CRITICAL SECTION + } // Query the size of the blocks in the pool so we can store it in the // mem_t object. @@ -198,7 +202,10 @@ void bli_mem_release( mem_t* mem ) // BEGIN CRITICAL SECTION - +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif + { // Increment the top of the memory pool. bli_pool_inc_top_index( pool ); @@ -211,6 +218,7 @@ void bli_mem_release( mem_t* mem ) // END CRITICAL SECTION + } } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c new file mode 100644 index 000000000..ca6503b8e --- /dev/null +++ b/frame/base/bli_threading.c @@ -0,0 +1,329 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_TREE_BARRIER +void bli_free_barrier_tree( barrier_t* barrier ) +{ + if( barrier == NULL ) + return; + barrier->count--; + if( barrier->count == 0 ) + { + bli_free_barrier_tree( barrier->dad ); + bli_free( barrier ); + } + return; +} +barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index) +{ + barrier_t* me = (barrier_t*) bli_malloc(sizeof(barrier_t)); // bli_malloc: nodes are released with bli_free() in bli_free_barrier_tree() + + me->dad = NULL; + me->signal = 0; + + // Base Case + if( num_threads <= arity ) { + //Now must be registered as a leaf + for(int i = 0; i < num_threads; i++) + { + leaves[leaf_index + i] = me; + } + me->count = num_threads; + me->arity = num_threads; + } + else { + // Otherwise this node has children + int threads_per_kid = num_threads / arity; + int defecit = num_threads - threads_per_kid * arity; + + for(int i = 0; i < arity; i++){ + int threads_this_kid = threads_per_kid; + if(i < defecit) threads_this_kid++; + + barrier_t* kid = bli_create_tree_barrier(threads_this_kid, arity, leaves, leaf_index); + kid->dad = me; + + leaf_index += threads_this_kid; + } + me->count = arity; + me->arity = arity; + } + + return me; +} + +void bli_cleanup_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; + for( dim_t i = 0; i < communicator->n_threads; i++) + { + bli_free_barrier_tree( communicator->barriers[i] ); + } + bli_free( communicator->barriers ); +} +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) +{ + if( communicator == NULL ) return; + communicator->sent_object = NULL; + communicator->n_threads = n_threads; + communicator->barriers = ( barrier_t** ) bli_malloc( sizeof( barrier_t* ) * n_threads ); + bli_create_tree_barrier( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 ); +} + +void tree_barrier( barrier_t* barack ) +{ +#ifdef BLIS_ENABLE_OPENMP + int my_signal = barack->signal; + int my_count; + + _Pragma("omp atomic capture") + my_count = barack->count--; + + 
if( my_count == 1 ) { + if( barack->dad != NULL ) { + tree_barrier( barack->dad ); + } + barack->count = barack->arity; + barack->signal = !barack->signal; + } + else { + volatile int* listener = &barack->signal; + while( *listener == my_signal ) {} + } +#else + return; +#endif +} + +void bli_barrier( thread_comm_t* comm, dim_t t_id ) +{ + tree_barrier( comm->barriers[t_id] ); +} + +#else + +void bli_cleanup_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; +} +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) +{ + if( communicator == NULL ) return; + communicator->sent_object = NULL; + communicator->n_threads = n_threads; + communicator->barrier_sense = 0; + communicator->barrier_threads_arrived = 0; +} +//barrier routine taken from art of multicore programming or something +void bli_barrier( thread_comm_t* communicator, dim_t t_id ) +{ +#ifdef BLIS_ENABLE_OPENMP + if(communicator == NULL || communicator->n_threads == 1) + return; + bool_t my_sense = communicator->barrier_sense; + dim_t my_threads_arrived; + + _Pragma("omp atomic capture") + my_threads_arrived = ++(communicator->barrier_threads_arrived); + + if( my_threads_arrived == communicator->n_threads ) { + communicator->barrier_threads_arrived = 0; + communicator->barrier_sense = !communicator->barrier_sense; + } + else { + volatile bool_t* listener = &communicator->barrier_sense; + while( *listener == my_sense ) {} + } +#else + return; +#endif +} +#endif + +void bli_free_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; + bli_cleanup_communicator( communicator ); + bli_free( communicator ); +} + +thread_comm_t* bli_create_communicator( dim_t n_threads ) +{ + thread_comm_t* comm = (thread_comm_t*) bli_malloc( sizeof(thread_comm_t) ); + bli_setup_communicator( comm, n_threads ); + return comm; +} + +void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_send ) +{ + if( communicator == NULL || 
communicator->n_threads == 1 ) return to_send; + + if( id == 0 ) communicator->sent_object = to_send; + + bli_barrier( communicator, id ); + void * object = communicator->sent_object; + bli_barrier( communicator, id ); + + return object; +} + +thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + + thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) ); + bli_setup_thread_info( thr, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); + return thr; +} + +void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + thr->ocomm = ocomm; + thr->ocomm_id = ocomm_id; + thr->icomm = icomm; + thr->icomm_id = icomm_id; + + thr->n_way = n_way; + thr->work_id = work_id; +} + +void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) +{ + thrinfo_t* thread = (thrinfo_t*) thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + + dim_t size = all_end - all_start; + dim_t n_pt = size / n_way; + n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; + n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); + *start = work_id * n_pt + all_start; + *end = bli_min( *start + n_pt, size + all_start ); +} + +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) +{ + thrinfo_t* thread = (thrinfo_t*) thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + dim_t size = all_end - all_start; + + *start = 0; + *end = all_end - all_start; + + if( forward ) { + dim_t curr_caucus = n_way - 1; + dim_t len = 0; + dim_t num = size*size / n_way; // 2xArea per thread? + while(1){ + dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus + width = (width % block_factor == 0) ? 
width : width + block_factor - (width % block_factor); + if( curr_caucus == work_id ) { + *start = bli_max( 0 , *end - width ) + all_start; + *end = *end + all_start; + return; + } + else{ + *end -= width; + len += width; + curr_caucus--; + } + } + } + else{ + dim_t num = size*size / n_way; + while(1){ + dim_t width = ceil(sqrt(*start * *start + num)) - *start; + width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); + + if( work_id == 0 ) { + *start = *start + all_start; + *end = bli_min( *start + width, all_end ); + return; + } + else{ + *start = *start + width; + } + work_id--; + } + } +} + +void bli_level3_thread_decorator( dim_t n_threads, + level3_int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + void* cntl, + void** thread ) +{ +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + func( alpha, + a, + b, + beta, + c, + cntl, + thread[omp_id] ); + } +#else + func( alpha, + a, + b, + beta, + c, + cntl, + thread[0] ); +#endif +} + +dim_t bli_read_nway_from_env( char* env ) +{ + dim_t number = 1; + char* str = getenv( env ); + if( str != NULL ) + { + number = strtol( str, NULL, 10 ); + } + return number; +} diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h new file mode 100644 index 000000000..aa8bd8152 --- /dev/null +++ b/frame/base/bli_threading.h @@ -0,0 +1,136 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#ifndef BLIS_THREADING_H +#define BLIS_THREADING_H + +#ifdef BLIS_TREE_BARRIER + struct barrier_s + { + int arity; + int count; + struct barrier_s* dad; + int signal; + }; + typedef struct barrier_s barrier_t; + + struct thread_comm_s + { + void* sent_object; + dim_t n_threads; + barrier_t** barriers; + }; +#else + struct thread_comm_s + { + void* sent_object; + dim_t n_threads; + + bool_t barrier_sense; + dim_t barrier_threads_arrived; + }; +#endif +typedef struct thread_comm_s thread_comm_t; + +// Thread Communicator Interface Definitions +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads ); +void bli_cleanup_communicator( thread_comm_t* communicator ); +thread_comm_t* bli_create_communicator( dim_t n_threads ); +void bli_free_communicator( thread_comm_t* communicator ); +void* bli_broadcast_structure( thread_comm_t* communicator, dim_t inside_id, void* to_send ); +void bli_barrier( thread_comm_t* communicator, dim_t thread_id ); + +struct thrinfo_s +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct used to parallelize the loop + dim_t work_id; //What we're working on +}; +typedef struct thrinfo_s thrinfo_t; + +// Thread Info Interface Definitions +#define thread_ocomm( thread ) (thread->ocomm) +#define thread_icomm( thread ) (thread->icomm) + +#define thread_id( thread ) (thread->ocomm_id) +#define thread_num_threads( thread ) (thread->ocomm->n_threads) + +#define thread_work_id( thread ) (thread->work_id) +#define thread_n_way( thread ) (thread->n_way) +#define thread_am_ochief( thread ) (thread->ocomm_id == 0) +#define thread_am_ichief( thread ) (thread->icomm_id == 0) + +#define thread_obroadcast( thread, ptr ) 
bli_broadcast_structure( thread->ocomm, thread->ocomm_id, ptr ) +#define thread_ibroadcast( thread, ptr ) bli_broadcast_structure( thread->icomm, thread->icomm_id, ptr ) +#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) +#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) + +void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end); +thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +dim_t bli_read_nway_from_env( char* env ); + +//void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); +//thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); + + +//TODO: These need to be included after the thread info and thread comm definitions +// But this doesn't seem like the best place to put these includes. +// Note that the bli_packm_threading.h must be included before the others! 
+#include "bli_packm_threading.h" +#include "bli_gemm_threading.h" +#include "bli_herk_threading.h" +#include "bli_trmm_threading.h" +#include "bli_trsm_threading.h" + +typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); +void bli_level3_thread_decorator( dim_t num_threads, + level3_int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + void* cntl, + void** thread ); + +#endif diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 088e9c0f2..7b2a2dfd4 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -43,4 +43,9 @@ extern obj_t BLIS_MINUS_ONE_HALF; extern obj_t BLIS_MINUS_ONE; extern obj_t BLIS_MINUS_TWO; +extern thread_comm_t BLIS_SINGLE_COMM; +extern packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; +extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +extern herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; + #endif diff --git a/frame/include/blis.h b/frame/include/blis.h index bcf2fe1cf..2ee68785b 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -65,6 +65,12 @@ extern "C" { #include "bli_type_defs.h" #include "bli_macro_defs.h" + +// -- Threading definitions -- +#include +#include "bli_threading.h" + +// -- Constant definitions -- #include "bli_extern_defs.h" diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.c b/kernels/bgq/1/bli_axpyv_opt_var1.c index 163e8bb49..8f389091b 100644 --- a/kernels/bgq/1/bli_axpyv_opt_var1.c +++ b/kernels/bgq/1/bli_axpyv_opt_var1.c @@ -52,7 +52,6 @@ void bli_daxpyv_opt_var1( } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - printf("Defaulting to reference!"); BLIS_DAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy ); return; } diff --git a/kernels/bgq/1/bli_dotv_opt_var1.c b/kernels/bgq/1/bli_dotv_opt_var1.c index 2cb623db2..11bf4741d 100644 --- a/kernels/bgq/1/bli_dotv_opt_var1.c +++ b/kernels/bgq/1/bli_dotv_opt_var1.c @@ -85,7 +85,7 @@ void bli_ddotv_opt_var1( rhos += vec_extract( rhov, 2 ); rhos += vec_extract( rhov, 3 ); } - for ( dim_t i = n_left; i < n_left; i++ ) + for ( dim_t i = 0; i < n_left; i++ ) { rhos += x[4*n_run + i] * y[4*n_run + i]; } diff --git a/kernels/bgq/3/bli_gemm_8x8.c b/kernels/bgq/3/bli_gemm_8x8.c index 2c1842f41..e2fe3f8d2 100644 --- a/kernels/bgq/3/bli_gemm_8x8.c +++ b/kernels/bgq/3/bli_gemm_8x8.c @@ -33,28 +33,9 @@ */ #include "blis.h" - -void bli_sgemm_8x8( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) -{ - /* Just call the reference implementation. */ - BLIS_SGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); -} - - +#undef restrict +#include +#include /* @@ -62,7 +43,7 @@ void bli_sgemm_8x8( * Instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread: * "Achievable peak can either be: * 1) 12.8 GF 8 FMAs cycle * 1.6 GHz - * 2) 8.53 GF Takes into account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread + * 2) 8.53 GF Takes into account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread * 3) 7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread" * * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. 
@@ -74,14 +55,14 @@ void bli_sgemm_8x8(
  */
 
 void bli_dgemm_8x8(
-                        dim_t      k,
-                        double*    alpha,
-                        double*    a,
-                        double*    b,
-                        double*    beta,
-                        double*    c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data
-                      )
+                        dim_t             k,
+                        double* restrict  alpha,
+                        double* restrict  a,
+                        double* restrict  b,
+                        double* restrict  beta,
+                        double* restrict  c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*        data
+                      )
 {
 
     //Registers for storing C.
@@ -221,126 +202,169 @@ void bli_dgemm_8x8(
     UPDATE( AB, c, 4 );
 }
 
-void bli_dgemm_8x8_mt(
-                           dim_t      k,
-                           double*    alpha,
-                           double*    a,
-                           double*    b,
-                           double*    beta,
-                           double*    c, inc_t rs_c, inc_t cs_c,
-                           auxinfo_t* data,
-                           dim_t      tid
-                         )
+// Debug helper: prints the four doubles extracted from v, tab-separated.
+void printvec(vector4double v)
 {
-    bli_dgemm_8x8( k,
-                   alpha,
-                   a,
-                   b, beta,
-                   c,
-                   rs_c, cs_c,
-                   data );
-}
-
-void bli_cgemm_8x8(
-                        dim_t      k,
-                        scomplex*  alpha,
-                        scomplex*  a,
-                        scomplex*  b,
-                        scomplex*  beta,
-                        scomplex*  c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data
-                      )
-{
-    /* Just call the reference implementation. */
-    BLIS_CGEMM_UKERNEL_REF( k,
-                            alpha,
-                            a,
-                            b,
-                            beta,
-                            c, rs_c, cs_c,
-                            data );
+    double a = vec_extract(v, 0);
+    double b = vec_extract(v, 1);
+    double c = vec_extract(v, 2);
+    double d = vec_extract(v, 3);
+    printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n", a, b, c, d);
 }
 
+// Double-complex gemm micro-kernel using vector4double intrinsics: k steps of
+// 8 doubles from a and 8 from b feed sixteen accumulators, which are combined
+// as alpha*(a*b) + beta*C and written back to C.  The data argument is unused.
 void bli_zgemm_8x8(
-                        dim_t      k,
-                        dcomplex*  alpha,
-                        dcomplex*  a,
-                        dcomplex*  b,
-                        dcomplex*  beta,
-                        dcomplex*  c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data
-                      )
+                        dim_t      k,
+                        dcomplex*  alpha_z,
+                        dcomplex*  a_z,
+                        dcomplex*  b_z,
+                        dcomplex*  beta_z,
+                        dcomplex*  c_z, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t* data
+                      )
 {
-    /* Just call the reference implementation. */
-    BLIS_ZGEMM_UKERNEL_REF( k,
-                            alpha,
-                            a,
-                            b,
-                            beta,
-                            c, rs_c, cs_c,
-                            data );
+    double * alpha = (double*) alpha_z;
+    double * beta = (double*) beta_z;
+    double * a = (double*) a_z;
+    double * b = (double*) b_z;
+    double * c = (double*) c_z;
+    // NOTE(review): the casts assume dcomplex is two consecutive doubles (real, imag) - confirm.
+    //Registers for storing C.
+    //2 2x4 subblocks of C, c0, and c1
+    //Each sub-block has 4 columns, 0, 1, 2, 3
+    //Each column has 2 partial sum, a and b, and contains 2 complex numbers.
+    vector4double c00a = vec_splats( 0.0 );
+    vector4double c00b = vec_splats( 0.0 );
+    vector4double c01a = vec_splats( 0.0 );
+    vector4double c01b = vec_splats( 0.0 );
+    vector4double c02a = vec_splats( 0.0 );
+    vector4double c02b = vec_splats( 0.0 );
+    vector4double c03a = vec_splats( 0.0 );
+    vector4double c03b = vec_splats( 0.0 );
+
+    vector4double c10a = vec_splats( 0.0 );
+    vector4double c10b = vec_splats( 0.0 );
+    vector4double c11a = vec_splats( 0.0 );
+    vector4double c11b = vec_splats( 0.0 );
+    vector4double c12a = vec_splats( 0.0 );
+    vector4double c12b = vec_splats( 0.0 );
+    vector4double c13a = vec_splats( 0.0 );
+    vector4double c13b = vec_splats( 0.0 );
+
+
+    vector4double b0, b1, b2, b3;
+    vector4double a0, a1;
+
+    for( dim_t i = 0; i < k; i++ )
+    {
+        // Byte offsets: b0..b3 start at doubles 0, 2, 4, 6 of this step's b data; a0, a1 at doubles 0 and 4 of a.
+        b0 = vec_ld2a( 0 * sizeof(double), &b[8*i] );
+        b1 = vec_ld2a( 2 * sizeof(double), &b[8*i] );
+        b2 = vec_ld2a( 4 * sizeof(double), &b[8*i] );
+        b3 = vec_ld2a( 6 * sizeof(double), &b[8*i] );
+
+        a0 = vec_lda ( 0 * sizeof(double), &a[8*i] );
+        a1 = vec_lda ( 4 * sizeof(double), &a[8*i] );
+
+        c00a = vec_xmadd ( b0, a0, c00a );
+        c00b = vec_xxcpnmadd( a0, b0, c00b );
+        c01a = vec_xmadd ( b1, a0, c01a );
+        c01b = vec_xxcpnmadd( a0, b1, c01b );
+
+        c02a = vec_xmadd ( b2, a0, c02a );
+        c02b = vec_xxcpnmadd( a0, b2, c02b );
+        c03a = vec_xmadd ( b3, a0, c03a );
+        c03b = vec_xxcpnmadd( a0, b3, c03b );
+
+
+        c10a = vec_xmadd ( b0, a1, c10a );
+        c10b = vec_xxcpnmadd( a1, b0, c10b );
+        c11a = vec_xmadd ( b1, a1, c11a );
+        c11b = vec_xxcpnmadd( a1, b1, c11b );
+
+        c12a = vec_xmadd ( b2, a1, c12a );
+        c12b = vec_xxcpnmadd( a1, b2, c12b );
+        c13a = vec_xmadd ( b3, a1, c13a );
+        c13b = vec_xxcpnmadd( a1, b3, c13b );
+
+    }
+
+    vector4double zed = vec_splats( 0.0 ); // zero addend for the alpha/beta products in ZUPDATE
+
+    vector4double AB;
+    vector4double C = vec_splats( 0.0 );
+
+    double alphar = *alpha;
+    double alphai = *(alpha+1);
+    double betar = *beta;
+    double betai = *(beta+1);
+    vector4double alphav = vec_splats( 0.0 );
+    vector4double betav = vec_splats( 0.0 );
+    alphav = vec_insert( alphar, alphav, 0);
+    alphav = vec_insert( alphai, alphav, 1);
+    alphav = vec_insert( alphar, alphav, 2);
+    alphav = vec_insert( alphai, alphav, 3);
+    betav = vec_insert( betar, betav, 0);
+    betav = vec_insert( betai, betav, 1);
+    betav = vec_insert( betar, betav, 2);
+    betav = vec_insert( betai, betav, 3);
+    double ct;
+
+
+    //Macro to update 2 elements of C in a column.
+    //REG1 is the register holding the first partial sum of those 2 elements
+    //REG2 is the register holding the second partial sum of those 2 elements
+    //ADDR is the address to write them to
+    //OFFSET is the offset from ADDR in doubles (scaled by rs_c); two doubles per complex value
+#define ZUPDATE( REG1, REG2, ADDR, OFFSET )     \
+{                                               \
+    ct = *(ADDR + (OFFSET + 0) * rs_c);         \
+    C = vec_insert( ct, C, 0 );                 \
+    ct = *(ADDR + (OFFSET + 0) * rs_c + 1);     \
+    C = vec_insert( ct, C, 1 );                 \
+    ct = *(ADDR + (OFFSET + 2) * rs_c);         \
+    C = vec_insert( ct, C, 2 );                 \
+    ct = *(ADDR + (OFFSET + 2) * rs_c + 1);     \
+    C = vec_insert( ct, C, 3 );                 \
+                                                \
+    AB = vec_sub(REG1, REG2 );                  \
+                                                \
+    /* Scale by alpha */                        \
+    REG1 = vec_xmadd( alphav, AB, zed );        \
+    REG2 = vec_xxcpnmadd( AB, alphav, zed );    \
+    AB = vec_sub(REG1, REG2 );                  \
+                                                \
+                                                \
+    /* Scale by beta */                         \
+    REG1 = vec_xmadd( betav, C, zed );          \
+    REG2 = vec_xxcpnmadd( C, betav, zed );      \
+    C = vec_sub(REG1, REG2 );                   \
+                                                \
+    /* Add AB to C */                           \
+    C = vec_add( AB, C );                       \
+                                                \
+    ct = vec_extract( C, 0 );                   \
+    *(ADDR + (OFFSET + 0) * rs_c) = ct;         \
+    ct = vec_extract( C, 1 );                   \
+    *(ADDR + (OFFSET + 0) * rs_c + 1) = ct;     \
+    ct = vec_extract( C, 2 );                   \
+    *(ADDR + (OFFSET + 2) * rs_c) = ct;         \
+    ct = vec_extract( C, 3 );                   \
+    *(ADDR + (OFFSET + 2) * rs_c + 1) = ct;     \
 }
 
-void bli_sgemm_8x8_mt(
-                        dim_t      k,
-                        float*     alpha,
-                        float*     a,
-                        float*     b,
-                        float*     beta,
-                        float*     c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data,
-                        dim_t      t_id
-                      )
-{
-    /* Just call the reference implementation. */
-    BLIS_SGEMM_UKERNEL_REF( k,
-                            alpha,
-                            a,
-                            b,
-                            beta,
-                            c, rs_c, cs_c,
-                            data );
-}
-
-void bli_cgemm_8x8_mt(
-                        dim_t      k,
-                        scomplex*  alpha,
-                        scomplex*  a,
-                        scomplex*  b,
-                        scomplex*  beta,
-                        scomplex*  c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data,
-                        dim_t      t_id
-                      )
-{
-    /* Just call the reference implementation. */
-    BLIS_CGEMM_UKERNEL_REF( k,
-                            alpha,
-                            a,
-                            b,
-                            beta,
-                            c, rs_c, cs_c,
-                            data );
-}
-
-void bli_zgemm_8x8_mt(
-                        dim_t      k,
-                        dcomplex*  alpha,
-                        dcomplex*  a,
-                        dcomplex*  b,
-                        dcomplex*  beta,
-                        dcomplex*  c, inc_t rs_c, inc_t cs_c,
-                        auxinfo_t* data,
-                        dim_t      t_id
-                      )
-{
-    /* Just call the reference implementation. */
-    BLIS_ZGEMM_UKERNEL_REF( k,
-                            alpha,
-                            a,
-                            b,
-                            beta,
-                            c, rs_c, cs_c,
-                            data );
+    ZUPDATE( c00a, c00b, c, 0 );
+    ZUPDATE( c10a, c10b, c, 4 );
+    c += 2*cs_c;
+    ZUPDATE( c01a, c01b, c, 0 );
+    ZUPDATE( c11a, c11b, c, 4 );
+    c += 2*cs_c;
+    ZUPDATE( c02a, c02b, c, 0 );
+    ZUPDATE( c12a, c12b, c, 4 );
+    c += 2*cs_c;
+    ZUPDATE( c03a, c03b, c, 0 );
+    ZUPDATE( c13a, c13b, c, 4 );
 }
diff --git a/kernels/bgq/3/bli_gemm_8x8.h b/kernels/bgq/3/bli_gemm_8x8.h
index 75401eecb..b6ce51824 100644
--- a/kernels/bgq/3/bli_gemm_8x8.h
+++ b/kernels/bgq/3/bli_gemm_8x8.h
@@ -63,8 +63,7 @@ void PASTEMAC(ch,varname)( \
                           ctype*     b, \
                           ctype*     beta, \
                           ctype*     c, inc_t rs_c, inc_t cs_c, \
-                          auxinfo_t* data, \
-                          dim_t      tid \
+                          auxinfo_t* data \
                         );
 
 INSERT_GENTPROT_BASIC( gemm_8x8_mt )
diff --git a/mpi_test/Makefile b/mpi_test/Makefile
new file mode 100644
index 000000000..cb317471a
--- /dev/null
+++ b/mpi_test/Makefile
@@ -0,0 +1,323 @@
+#!/bin/bash
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all \ + blis essl \ + clean cleanx + + + +# +# --- Makefile initialization -------------------------------------------------- +# + +# Define the name of the configuration file. 
+CONFIG_MK_FILE := config.mk + +# Define the name of the file containing build and architecture-specific +# makefile definitions. +MAKE_DEFS_FILE := make_defs.mk + +# Locations of important files. +ROOT_PATH := .. +CONFIG_DIR := config + + + +# +# --- Include makefile configuration file -------------------------------------- +# + +# Construct the path to the makefile configuration file that was generated by +# the configure script. +CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE) + +# Include the configuration file. +-include $(CONFIG_MK_PATH) + +# Detect whether we actually got the configuration file. If we didn't, then +# it is likely that the user has not yet generated it (via configure). +ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes) +CONFIG_MK_PRESENT := yes +else +CONFIG_MK_PRESENT := no +endif + +# Now we have access to CONFIG_NAME, which tells us which sub-directory of the +# config directory to use as our configuration. +CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME) + + + +# +# --- Include makefile definitions file ---------------------------------------- +# + +# Construct the path to the makefile definitions file residing inside of +# the configuration sub-directory. +MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE) + +# Include the makefile definitions file. +-include $(MAKE_DEFS_MK_PATH) + +# Detect whether we actually got the make definitions file. If we didn't, then +# it is likely that the configuration is invalid (or incomplete). +ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes) +MAKE_DEFS_MK_PRESENT := yes +else +MAKE_DEFS_MK_PRESENT := no +endif + + + +# +# --- BLAS and LAPACK implementations ------------------------------------------ +# + +# BLIS library and header path. This is simply wherever it was installed. +BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib +BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis + +# BLIS library. +BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a + +# BLAS library path(s). This is where the BLAS libraries reside. 
+BLAS_LIB_PATH := $(HOME)/flame/lib +MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64/ +ESSL_LIB_PATH := /soft/libraries/essl/current/lib64 + +# OpenBLAS +OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a + +# ATLAS +ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \ + $(BLAS_LIB_PATH)/libatlas.a + +# MKL +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_sequential \ + -lmkl_core \ + -lmkl_intel_lp64 + +# ESSL +# Note: ESSL is named differently for SMP and/or BG +ESSL_LIB := $(ESSL_LIB_PATH)/libesslsmpbg.a \ + -L$(IBM_MAIN_DIR)/xlsmp/bg/3.1/bglib64/ \ + -L$(IBM_MAIN_DIR)/xlf/bg/14.1/bglib64/ \ + -lxlsmp -lxlf90_r -lxlfmath -lxl + +# Accelerate +MAC_LIB := -framework Accelerate + + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c)) + +# Override CFLAGS from make_defs.mk here, if desired. 
+#CFLAGS := -g -O2 -march=native + +# Add installed and local header paths to CFLAGS +CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) + +LINKER := $(CC) +#LDFLAGS := -L/home/00146/field/gnu/gcc-4.8.2/lib64 +#LDFLAGS += -lgfortran -lm -lpthread + + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +# Complete list of possible targets when defining 'all': +# +# blis openblas atlas mkl mac essl +# +all: blis essl + +blis: test_gemm_blis.x \ + test_hemm_blis.x \ + test_herk_blis.x \ + test_her2k_blis.x \ + test_trmm_blis.x \ + test_trsm_blis.x + +essl: test_gemm_essl.x \ + test_hemm_essl.x \ + test_herk_essl.x \ + test_her2k_essl.x \ + test_trmm_essl.x \ + test_trsm_essl.x + +openblas: test_gemv_openblas.x \ + test_ger_openblas.x \ + test_hemv_openblas.x \ + test_her_openblas.x \ + test_her2_openblas.x \ + test_trmv_openblas.x \ + test_trsv_openblas.x \ + \ + test_gemm_openblas.x \ + test_hemm_openblas.x \ + test_herk_openblas.x \ + test_her2k_openblas.x \ + test_trmm_openblas.x \ + test_trsm_openblas.x + +atlas: test_gemv_atlas.x \ + test_ger_atlas.x \ + test_hemv_atlas.x \ + test_her_atlas.x \ + test_her2_atlas.x \ + test_trmv_atlas.x \ + test_trsv_atlas.x \ + \ + test_gemm_atlas.x \ + test_hemm_atlas.x \ + test_herk_atlas.x \ + test_her2k_atlas.x \ + test_trmm_atlas.x \ + test_trsm_atlas.x + +mkl: test_gemv_mkl.x \ + test_ger_mkl.x \ + test_hemv_mkl.x \ + test_her_mkl.x \ + test_her2_mkl.x \ + test_trmv_mkl.x \ + test_trsv_mkl.x \ + \ + test_gemm_mkl.x \ + test_hemm_mkl.x \ + test_herk_mkl.x \ + test_her2k_mkl.x \ + test_trmm_mkl.x \ + test_trsm_mkl.x + +mac: test_gemv_mac.x \ + test_ger_mac.x \ + test_hemv_mac.x \ + test_her_mac.x \ + test_her2_mac.x \ + test_trmv_mac.x \ + test_trsv_mac.x \ + \ + test_gemm_mac.x \ + test_hemm_mac.x \ + test_herk_mac.x \ + test_her2k_mac.x \ + test_trmm_mac.x \ + test_trsm_mac.x + + + +# --Object file rules -- + +$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c + $(CC) $(CFLAGS) -c $< -o $@ + 
+test_%_openblas.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@ + +test_%_atlas.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@ + +test_%_mkl.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@ + +test_%_essl.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@ + +test_%_mac.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@ + +test_%_blis.o: test_%.c + $(CC) $(CFLAGS) -DBLIS -c $< -o $@ + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. + +test_%_openblas.x: test_%_openblas.o $(BLIS_LIB) + $(LINKER) $< $(OPENBLAS_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_atlas.x: test_%_atlas.o $(BLIS_LIB) + $(LINKER) $< $(ATLAS_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_mkl.x: test_%_mkl.o $(BLIS_LIB) + $(LINKER) $< $(MKL_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_essl.x: test_%_essl.o $(BLIS_LIB) + $(LINKER) $< $(ESSL_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_mac.x: test_%_mac.o $(BLIS_LIB) + $(LINKER) $< $(MAC_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_blis.x: test_%_blis.o $(BLIS_LIB) + $(LINKER) $< $(BLIS_LIB) $(LDFLAGS) -o $@ + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.o *.x + diff --git a/mpi_test/test_gemm.c b/mpi_test/test_gemm.c new file mode 100644 index 000000000..5864e667a --- /dev/null +++ b/mpi_test/test_gemm.c @@ -0,0 +1,232 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <stdlib.h>
+#include "blis.h"
+#include <mpi.h>
+
+//           transa transb m     n     k     alpha    a        lda   b        ldb   beta     c        ldc
+//void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+// MPI timing driver for gemm: args are m n k p_begin p_inc p_end; a negative m/n/k is scaled by the sweep value p.
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input, k_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int   r, n_repeats;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit(1); // stop here: argv[1..6] are read unconditionally below
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	k_input = strtol( argv[3], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc   = strtol( argv[5], NULL, 10 );
+	p_end   = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+	// Sweep values are dealt round-robin: rank r handles p_begin + r*p_inc, then every world_size-th step.
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt_a, m, k, 0, 0, &a );
+		bli_obj_create( dt_b, k, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+
+		bli_setsc(  (0.9/1.0), 0.2, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		// c_save keeps the initial C so every repeat starts from the same operand.
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef BLIS
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_gemm( &alpha,
+			//bli_gemm4m( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#else
+			if ( bli_is_real( dt_a ) )
+			{
+				f77_char transa = 'N';
+				f77_char transb = 'N';
+				f77_int  mm     = bli_obj_length( c );
+				f77_int  kk     = bli_obj_width_after_trans( a );
+				f77_int  nn     = bli_obj_width( c );
+				f77_int  lda    = bli_obj_col_stride( a );
+				f77_int  ldb    = bli_obj_col_stride( b );
+				f77_int  ldc    = bli_obj_col_stride( c );
+				double*  alphap = bli_obj_buffer( alpha );
+				double*  ap     = bli_obj_buffer( a );
+				double*  bp     = bli_obj_buffer( b );
+				double*  betap  = bli_obj_buffer( beta );
+				double*  cp     = bli_obj_buffer( c );
+
+				dgemm_( &transa,
+				        &transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+			else
+			{
+				f77_char transa = 'N';
+				f77_char transb = 'N';
+				f77_int  mm     = bli_obj_length( c );
+				f77_int  kk     = bli_obj_width_after_trans( a );
+				f77_int  nn     = bli_obj_width( c );
+				f77_int  lda    = bli_obj_col_stride( a );
+				f77_int  ldb    = bli_obj_col_stride( b );
+				f77_int  ldc    = bli_obj_col_stride( c );
+				dcomplex* alphap = bli_obj_buffer( alpha );
+				dcomplex* ap     = bli_obj_buffer( a );
+				dcomplex* bp     = bli_obj_buffer( b );
+				dcomplex* betap  = bli_obj_buffer( beta );
+				dcomplex* cp     = bli_obj_buffer( c );
+
+				zgemm_( &transa,
+				//zgemm3m_( &transa,
+				        &transb,
+				        &mm,
+				        &nn,
+				        &kk,
+				        alphap,
+				        ap, &lda,
+				        bp, &ldb,
+				        betap,
+				        cp, &ldc );
+			}
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_gemm_blis" );
+#else
+		printf( "data_gemm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread above; every rank must finalize before exiting
+	return 0;
+}
+
diff --git a/mpi_test/test_hemm.c b/mpi_test/test_hemm.c
new file mode 100644
index 000000000..4cab93ceb
--- /dev/null
+++ b/mpi_test/test_hemm.c
@@ -0,0 +1,252 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdlib.h>
+#include "blis.h"
+#include <mpi.h>
+
+//           side   uploa  m     n     alpha    a        lda   b        ldb   beta     c        ldc
+//void dsymm_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+// MPI timing driver for hemm: args are m n k p_begin p_inc p_end (argv[3] is not read); a negative m/n is scaled by p.
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, n_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int   r, n_repeats;
+	side_t side;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit(1); // stop here: argv[1..6] are read unconditionally below
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc   = strtol( argv[5], NULL, 10 );
+	p_end   = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+
+	side = BLIS_LEFT;
+	//side = BLIS_RIGHT;
+
+	uplo = BLIS_LOWER;
+	//uplo = BLIS_UPPER;
+	// Sweep values are dealt round-robin: rank r handles p_begin + r*p_inc, then every world_size-th step.
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt_a, m, m, 0, 0, &a );
+		else
+			bli_obj_create( dt_a, n, n, 0, 0, &a );
+		bli_obj_create( dt_b, m, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_HERMITIAN, a );
+		bli_obj_set_uplo( uplo, a );
+
+		// Randomize A, make it densely Hermitian, and zero the unstored
+		// triangle to ensure the implementation reads only from the stored
+		// region.
+		bli_randm( &a );
+		bli_mkherm( &a );
+		bli_mktrim( &a );
+/*
+		bli_obj_toggle_uplo( a );
+		bli_obj_inc_diag_off( 1, a );
+		bli_setm( &BLIS_ZERO, &a );
+		bli_obj_inc_diag_off( -1, a );
+		bli_obj_toggle_uplo( a );
+		bli_obj_set_diag( BLIS_NONUNIT_DIAG, a );
+		bli_scalm( &BLIS_TWO, &a );
+		bli_scalm( &BLIS_TWO, &a );
+*/
+
+
+		bli_setsc(  (2.0/1.0), 1.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		// c_save keeps the initial C so every repeat starts from the same operand.
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+/*
+			obj_t ar, ai;
+			bli_obj_alias_to( a, ar );
+			bli_obj_alias_to( a, ai );
+			bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+			bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+			bli_printm( "ar", &ar, "%4.1f", "" );
+			bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_hemm( side,
+			//bli_hemm4m( side,
+			          &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+#else
+
+			f77_char side   = 'L';
+			f77_char uplo   = 'L';
+			f77_int  mm     = bli_obj_length( c );
+			f77_int  nn     = bli_obj_width( c );
+			f77_int  lda    = bli_obj_col_stride( a );
+			f77_int  ldb    = bli_obj_col_stride( b );
+			f77_int  ldc    = bli_obj_col_stride( c );
+			double*  alphap = bli_obj_buffer( alpha );
+			double*  ap     = bli_obj_buffer( a );
+			double*  bp     = bli_obj_buffer( b );
+			double*  betap  = bli_obj_buffer( beta );
+			double*  cp     = bli_obj_buffer( c );
+
+			dsymm_( &side,
+			        &uplo,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%9.5f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		if ( bli_is_left( side ) )
+			gflops = ( 2.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+		else
+			gflops = ( 2.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_hemm_blis" );
+#else
+		printf( "data_hemm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread above; every rank must finalize before exiting
+	return 0;
+}
+
diff --git a/mpi_test/test_her2k.c b/mpi_test/test_her2k.c
new file mode 100644
index 000000000..f44ca4fb7
--- /dev/null
+++ b/mpi_test/test_her2k.c
@@ -0,0 +1,209 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <stdlib.h>
+#include "blis.h"
+#include <mpi.h>
+
+//            uploa  transa m     k     alpha    a        lda   b        ldb   beta     c        ldc
+//void dsyr2k_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+// MPI timing driver for her2k: args are m n k p_begin p_inc p_end (argv[2] is not read); a negative m/k is scaled by p.
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   m_input, k_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int   r, n_repeats;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit(1); // stop here: argv[1..6] are read unconditionally below
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	k_input = strtol( argv[3], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc   = strtol( argv[5], NULL, 10 );
+	p_end   = strtol( argv[6], NULL, 10 );
+
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+
+	uplo = BLIS_LOWER;
+	// Sweep values are dealt round-robin: rank r handles p_begin + r*p_inc, then every world_size-th step.
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt_a, m, k, 0, 0, &a );
+		bli_obj_create( dt_b, m, k, 0, 0, &b );
+		bli_obj_create( dt_c, m, m, 0, 0, &c );
+		bli_obj_create( dt_c, m, m, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_HERMITIAN, c );
+		bli_obj_set_uplo( uplo, c );
+
+
+		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		// c_save keeps the initial C so every repeat starts from the same operand.
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_her2k( &alpha,
+			           &a,
+			           &b,
+			           &beta,
+			           &c );
+
+#else
+
+			f77_char uploa  = 'L';
+			f77_char transa = 'N';
+			f77_int  mm     = bli_obj_length( c );
+			f77_int  kk     = bli_obj_width_after_trans( a );
+			f77_int  lda    = bli_obj_col_stride( a );
+			f77_int  ldb    = bli_obj_col_stride( b );
+			f77_int  ldc    = bli_obj_col_stride( c );
+			double*  alphap = bli_obj_buffer( alpha );
+			double*  ap     = bli_obj_buffer( a );
+			double*  bp     = bli_obj_buffer( b );
+			double*  betap  = bli_obj_buffer( beta );
+			double*  cp     = bli_obj_buffer( c );
+
+			dsyr2k_( &uploa,
+			         &transa,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * m ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_her2k_blis" );
+#else
+		printf( "data_her2k_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k, dtime_save, gflops );
+
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread above; every rank must finalize before exiting
+	return 0;
+}
+
diff --git a/mpi_test/test_herk.c b/mpi_test/test_herk.c
new file mode 100644
index 000000000..ffe9ab85f
--- /dev/null
+++ b/mpi_test/test_herk.c
@@ -0,0 +1,200 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+//          uploa  transa m     k     alpha    a        lda   beta     c        ldc
+//void dsyrk_( char*, char*, int*, int*, double*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+
+// MPI-distributed timing driver for the herk operation.
+//
+// Command line: m n k p_begin p_inc p_end
+//   m, k   - matrix dimensions (argv[1], argv[3]); a negative value -x
+//            means "x times the current problem size p".
+//   n      - accepted so all drivers share one command line; not read here.
+//   p_begin, p_inc, p_end - range of problem sizes p to sweep.
+//
+// The sweep is dealt round-robin to the MPI ranks: rank r starts at
+// p_begin + r*p_inc and advances by p_inc*world_size. For every p the
+// operation is run n_repeats times on a fresh copy of c, and one line
+// holding the index, m, k, the time kept by bli_clock_min_diff() and the
+// derived GFLOPS rate is printed. Building with BLIS defined times
+// bli_herk(); otherwise the Fortran-77 BLAS routine dsyrk_() is timed.
+//
+// Returns 0 on success, 1 on a bad command line.
+int main( int argc, char** argv )
+{
+    obj_t a, c;
+    obj_t c_save;
+    obj_t alpha, beta;
+    dim_t m, k;
+    dim_t p;
+    dim_t p_begin, p_end, p_inc;
+    int   m_input, k_input;
+    num_t dt_a, dt_c;
+    num_t dt_alpha, dt_beta;
+    int   r, n_repeats;
+    uplo_t uplo;
+
+    double dtime;
+    double dtime_save;
+    double gflops;
+
+    // Validate the command line before any library is initialized so that
+    // the error path has nothing to tear down. (The original wrote "exit;",
+    // which is a no-op, and went on to read argv[] out of bounds.)
+    if( argc < 7 )
+    {
+        printf("Usage:\n");
+        printf("test_foo.x m n k p_begin p_inc p_end:\n");
+        return 1;
+    }
+
+    m_input = strtol( argv[1], NULL, 10 );
+    k_input = strtol( argv[3], NULL, 10 );
+    p_begin = strtol( argv[4], NULL, 10 );
+    p_inc   = strtol( argv[5], NULL, 10 );
+    p_end   = strtol( argv[6], NULL, 10 );
+
+    // A non-positive increment would never terminate the sweep and is
+    // also used as a divisor when the result index is printed.
+    if ( p_inc < 1 )
+    {
+        printf("p_inc must be positive.\n");
+        return 1;
+    }
+
+    bli_init();
+
+    n_repeats = 3;
+
+    int world_size, world_rank, provided;
+    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+    dt_a = BLIS_DOUBLE;
+    dt_c = BLIS_DOUBLE;
+    dt_alpha = BLIS_DOUBLE;
+    dt_beta = BLIS_DOUBLE;
+
+    uplo = BLIS_LOWER;
+
+    // Each rank handles every world_size-th problem size.
+    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+    {
+
+        if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+        else               m =     ( dim_t )    m_input;
+        if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+        else               k =     ( dim_t )    k_input;
+
+
+        bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+        bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+        bli_obj_create( dt_a, m, k, 0, 0, &a );
+        bli_obj_create( dt_c, m, m, 0, 0, &c );
+        bli_obj_create( dt_c, m, m, 0, 0, &c_save );
+
+        bli_randm( &a );
+        bli_randm( &c );
+
+        bli_obj_set_struc( BLIS_HERMITIAN, c );
+        bli_obj_set_uplo( uplo, c );
+
+
+        bli_setsc(  (2.0/1.0), 0.0, &alpha );
+        bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+
+        // Keep a pristine copy of c so every repeat starts from the same data.
+        bli_copym( &c, &c_save );
+
+        dtime_save = 1.0e9;
+
+        for ( r = 0; r < n_repeats; ++r )
+        {
+            bli_copym( &c_save, &c );
+
+
+            dtime = bli_clock();
+
+#ifdef PRINT
+            bli_printm( "a", &a, "%4.1f", "" );
+            bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+            //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+            bli_herk( &alpha,
+                      &a,
+                      &beta,
+                      &c );
+
+#else
+
+            // Reference path: all objects are real double here, so herk
+            // is timed through the BLAS routine dsyrk_.
+            f77_char uploa  = ( uplo == BLIS_LOWER ? 'L' : 'U' );
+            f77_char transa = 'N';
+            f77_int  mm     = bli_obj_length( c );
+            f77_int  kk     = bli_obj_width_after_trans( a );
+            f77_int  lda    = bli_obj_col_stride( a );
+            f77_int  ldc    = bli_obj_col_stride( c );
+            double*  alphap = bli_obj_buffer( alpha );
+            double*  ap     = bli_obj_buffer( a );
+            double*  betap  = bli_obj_buffer( beta );
+            double*  cp     = bli_obj_buffer( c );
+
+            dsyrk_( &uploa,
+                    &transa,
+                    &mm,
+                    &kk,
+                    alphap,
+                    ap, &lda,
+                    betap,
+                    cp, &ldc );
+#endif
+
+#ifdef PRINT
+            bli_printm( "c after", &c, "%4.1f", "" );
+            exit(1);
+#endif
+
+
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+
+        gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+        printf( "data_herk_blis" );
+#else
+        printf( "data_herk_%s", BLAS );
+#endif
+        // Cast the whole index expression so the argument really is the
+        // unsigned long that %lu expects, whatever the width of dim_t.
+        printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
+                ( unsigned long )( (p - p_begin + 1)/p_inc + 1 ),
+                ( unsigned long )m,
+                ( unsigned long )k, dtime_save, gflops );
+
+
+        bli_obj_free( &alpha );
+        bli_obj_free( &beta );
+
+        bli_obj_free( &a );
+        bli_obj_free( &c );
+        bli_obj_free( &c_save );
+    }
+
+    bli_finalize();
+
+    // Every successful MPI_Init_thread() must be paired with MPI_Finalize().
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/mpi_test/test_trmm.c b/mpi_test/test_trmm.c
new file mode 100644
index 000000000..4d8112be8
--- /dev/null
+++ b/mpi_test/test_trmm.c
@@ -0,0 +1,246 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+//          side   uplo   trans  diag   m     n     alpha    a        lda   b        ldb
+//void dtrmm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
+
+//#define PRINT
+
+// MPI-distributed timing driver for the trmm operation.
+//
+// Command line: m n k p_begin p_inc p_end
+//   m, n   - matrix dimensions (argv[1], argv[2]); a negative value -x
+//            means "x times the current problem size p".
+//   k      - accepted so all drivers share one command line; not read here.
+//   p_begin, p_inc, p_end - range of problem sizes p to sweep.
+//
+// The sweep is dealt round-robin to the MPI ranks: rank r starts at
+// p_begin + r*p_inc and advances by p_inc*world_size. For every p the
+// operation is run n_repeats times on a fresh copy of c, and one line
+// holding the index, m, n, the time kept by bli_clock_min_diff() and the
+// derived GFLOPS rate is printed. Building with BLIS defined times
+// bli_trmm(); otherwise the Fortran-77 BLAS routine dtrmm_() is timed.
+//
+// Returns 0 on success, 1 on a bad command line.
+int main( int argc, char** argv )
+{
+    obj_t a, b, c;
+    obj_t c_save;
+    obj_t alpha, beta;
+    dim_t m, n;
+    dim_t p;
+    dim_t p_begin, p_end, p_inc;
+    int   m_input, n_input;
+    num_t dt_a, dt_b, dt_c;
+    num_t dt_alpha, dt_beta;
+    int   r, n_repeats;
+    side_t side;
+    uplo_t uplo;
+
+    double dtime;
+    double dtime_save;
+    double gflops;
+
+    // Validate the command line before any library is initialized so that
+    // the error path has nothing to tear down. (The original wrote "exit;",
+    // which is a no-op, and went on to read argv[] out of bounds.)
+    // The usage text now lists all six arguments that are actually required.
+    if( argc < 7 )
+    {
+        printf("Usage:\n");
+        printf("test_foo.x m n k p_begin p_inc p_end:\n");
+        return 1;
+    }
+
+    m_input = strtol( argv[1], NULL, 10 );
+    n_input = strtol( argv[2], NULL, 10 );
+    p_begin = strtol( argv[4], NULL, 10 );
+    p_inc   = strtol( argv[5], NULL, 10 );
+    p_end   = strtol( argv[6], NULL, 10 );
+
+    // A non-positive increment would never terminate the sweep and is
+    // also used as a divisor when the result index is printed.
+    if ( p_inc < 1 )
+    {
+        printf("p_inc must be positive.\n");
+        return 1;
+    }
+
+    bli_init();
+
+    n_repeats = 3;
+
+    int world_size, world_rank, provided;
+    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+#if 1
+    dt_a = BLIS_DOUBLE;
+    dt_b = BLIS_DOUBLE;
+    dt_c = BLIS_DOUBLE;
+    dt_alpha = BLIS_DOUBLE;
+    dt_beta = BLIS_DOUBLE;
+#else
+    dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+
+    side = BLIS_LEFT;
+    //side = BLIS_RIGHT;
+
+    uplo = BLIS_LOWER;
+    //uplo = BLIS_UPPER;
+
+    // Each rank handles every world_size-th problem size.
+    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+    {
+
+        if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+        else               m =     ( dim_t )    m_input;
+        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+        else               n =     ( dim_t )    n_input;
+
+
+        bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+        bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+        // The triangular operand is m x m when applied from the left and
+        // n x n when applied from the right.
+        if ( bli_is_left( side ) )
+            bli_obj_create( dt_a, m, m, 0, 0, &a );
+        else
+            bli_obj_create( dt_a, n, n, 0, 0, &a );
+        bli_obj_create( dt_b, m, n, 0, 0, &b );
+        bli_obj_create( dt_c, m, n, 0, 0, &c );
+        bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+        bli_obj_set_struc( BLIS_TRIANGULAR, a );
+        bli_obj_set_uplo( uplo, a );
+
+        bli_randm( &a );
+        bli_randm( &c );
+        bli_randm( &b );
+
+/*
+        bli_obj_toggle_uplo( a );
+        bli_obj_inc_diag_off( -1, a );
+        bli_setm( &BLIS_ZERO, &a );
+        bli_obj_inc_diag_off( 1, a );
+        bli_obj_toggle_uplo( a );
+        bli_obj_set_diag( BLIS_NONUNIT_DIAG, a );
+        bli_scalm( &BLIS_TWO, &a );
+        //bli_scalm( &BLIS_TWO, &a );
+*/
+
+
+
+        bli_setsc(  (2.0/1.0), 0.0, &alpha );
+        bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+
+        // Keep a pristine copy of c so every repeat starts from the same data.
+        bli_copym( &c, &c_save );
+
+        dtime_save = 1.0e9;
+
+        for ( r = 0; r < n_repeats; ++r )
+        {
+            bli_copym( &c_save, &c );
+
+            dtime = bli_clock();
+
+
+#ifdef PRINT
+
+/*
+            obj_t ar, ai;
+            bli_obj_alias_to( a, ar );
+            bli_obj_alias_to( a, ai );
+            bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+            bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+            bli_printm( "ar", &ar, "%4.1f", "" );
+            bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+            bli_printm( "a", &a, "%4.1f", "" );
+            bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+            bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+            bli_trmm( side,
+            //bli_trmm4m( side,
+                      &alpha,
+                      &a,
+                      &c );
+
+#else
+
+            // Reference path. The side/uplo characters are derived from the
+            // settings above (instead of hard-coded 'L') so both paths time
+            // the same problem, and they no longer shadow side/uplo.
+            // NOTE(review): dtrmm_ is only valid for the BLIS_DOUBLE
+            // configuration selected by the "#if 1" above.
+            f77_char f77_side = ( bli_is_left( side ) ? 'L' : 'R' );
+            f77_char f77_uplo = ( uplo == BLIS_LOWER  ? 'L' : 'U' );
+            f77_char transa   = 'N';
+            f77_char diag     = 'N';
+            f77_int  mm       = bli_obj_length( c );
+            f77_int  nn       = bli_obj_width( c );
+            f77_int  lda      = bli_obj_col_stride( a );
+            f77_int  ldc      = bli_obj_col_stride( c );
+            double*  alphap   = bli_obj_buffer( alpha );
+            double*  ap       = bli_obj_buffer( a );
+            double*  cp       = bli_obj_buffer( c );
+
+            dtrmm_( &f77_side,
+                    &f77_uplo,
+                    &transa,
+                    &diag,
+                    &mm,
+                    &nn,
+                    alphap,
+                    ap, &lda,
+                    cp, &ldc );
+#endif
+
+#ifdef PRINT
+            bli_printm( "c after", &c, "%4.1f", "" );
+            exit(1);
+#endif
+
+
+
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+
+        if ( bli_is_left( side ) )
+            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+        else
+            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+        if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+        printf( "data_trmm_blis" );
+#else
+        printf( "data_trmm_%s", BLAS );
+#endif
+        // Cast the whole index expression so the argument really is the
+        // unsigned long that %lu expects, whatever the width of dim_t.
+        printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
+                ( unsigned long )( (p - p_begin + 1)/p_inc + 1 ),
+                ( unsigned long )m,
+                ( unsigned long )n, dtime_save, gflops );
+
+
+        bli_obj_free( &alpha );
+        bli_obj_free( &beta );
+
+        bli_obj_free( &a );
+        bli_obj_free( &b );
+        bli_obj_free( &c );
+        bli_obj_free( &c_save );
+    }
+
+    bli_finalize();
+
+    // Every successful MPI_Init_thread() must be paired with MPI_Finalize().
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/mpi_test/test_trsm.c b/mpi_test/test_trsm.c
new file mode 100644
index 000000000..563bbdaaf
--- /dev/null
+++ b/mpi_test/test_trsm.c
@@ -0,0 +1,282 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+//          side   uplo   trans  diag   m     n     alpha    a        lda   b        ldb
+//void dtrsm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
+
+//#define PRINT
+
+// MPI-distributed timing driver for the trsm operation.
+//
+// Command line: m n k p_begin p_inc p_end
+//   m, n   - matrix dimensions (argv[1], argv[2]); a negative value -x
+//            means "x times the current problem size p".
+//   k      - accepted so all drivers share one command line; not read here.
+//   p_begin, p_inc, p_end - range of problem sizes p to sweep.
+//
+// The sweep is dealt round-robin to the MPI ranks: rank r starts at
+// p_begin + r*p_inc and advances by p_inc*world_size. For every p the
+// operation is run n_repeats times on a fresh copy of c, and one line
+// holding the index, m, n, the time kept by bli_clock_min_diff() and the
+// derived GFLOPS rate is printed. Building with BLIS defined times
+// bli_trsm(); otherwise the Fortran-77 BLAS ?trsm_ routine matching the
+// selected datatype is timed.
+//
+// Returns 0 on success, 1 on a bad command line.
+int main( int argc, char** argv )
+{
+    obj_t a, b, c;
+    obj_t c_save;
+    obj_t alpha, beta;
+    dim_t m, n;
+    dim_t p;
+    dim_t p_begin, p_end, p_inc;
+    int   m_input, n_input;
+    num_t dt_a, dt_b, dt_c;
+    num_t dt_alpha, dt_beta;
+    int   r, n_repeats;
+    side_t side;
+    uplo_t uplo;
+
+    double dtime;
+    double dtime_save;
+    double gflops;
+
+    // Validate the command line before any library is initialized so that
+    // the error path has nothing to tear down. (The original wrote "exit;",
+    // which is a no-op, and went on to read argv[] out of bounds.)
+    if( argc < 7 )
+    {
+        printf("Usage:\n");
+        printf("test_foo.x m n k p_begin p_inc p_end:\n");
+        return 1;
+    }
+
+    m_input = strtol( argv[1], NULL, 10 );
+    n_input = strtol( argv[2], NULL, 10 );
+    p_begin = strtol( argv[4], NULL, 10 );
+    p_inc   = strtol( argv[5], NULL, 10 );
+    p_end   = strtol( argv[6], NULL, 10 );
+
+    // A non-positive increment would never terminate the sweep and is
+    // also used as a divisor when the result index is printed.
+    if ( p_inc < 1 )
+    {
+        printf("p_inc must be positive.\n");
+        return 1;
+    }
+
+    bli_init();
+
+    n_repeats = 3;
+
+    int world_size, world_rank, provided;
+    MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+    MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+#if 1
+    dt_a = BLIS_DOUBLE;
+    dt_b = BLIS_DOUBLE;
+    dt_c = BLIS_DOUBLE;
+    dt_alpha = BLIS_DOUBLE;
+    dt_beta = BLIS_DOUBLE;
+#else
+    dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_FLOAT;
+    //dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_SCOMPLEX;
+#endif
+
+    side = BLIS_LEFT;
+    //side = BLIS_RIGHT;
+
+    uplo = BLIS_LOWER;
+    //uplo = BLIS_UPPER;
+
+    // Each rank handles every world_size-th problem size.
+    for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+    {
+
+        if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+        else               m =     ( dim_t )    m_input;
+        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+        else               n =     ( dim_t )    n_input;
+
+
+        bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+        bli_obj_create( dt_beta,  1, 1, 0, 0, &beta );
+
+        // The triangular operand is m x m when applied from the left and
+        // n x n when applied from the right.
+        if ( bli_is_left( side ) )
+            bli_obj_create( dt_a, m, m, 0, 0, &a );
+        else
+            bli_obj_create( dt_a, n, n, 0, 0, &a );
+        bli_obj_create( dt_b, m, n, 0, 0, &b );
+        bli_obj_create( dt_c, m, n, 0, 0, &c );
+        bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+        bli_obj_set_struc( BLIS_TRIANGULAR, a );
+        bli_obj_set_uplo( uplo, a );
+        //bli_obj_set_diag( BLIS_UNIT_DIAG, a );
+
+        bli_randm( &a );
+        bli_randm( &c );
+        bli_randm( &b );
+
+/*
+        {
+            obj_t a2;
+
+            bli_obj_alias_to( a, a2 );
+            bli_obj_toggle_uplo( a2 );
+            bli_obj_inc_diag_off( 1, a2 );
+            bli_setm( &BLIS_ZERO, &a2 );
+            bli_obj_inc_diag_off( -2, a2 );
+            bli_obj_toggle_uplo( a2 );
+            bli_obj_set_diag( BLIS_NONUNIT_DIAG, a2 );
+            bli_scalm( &BLIS_TWO, &a2 );
+            //bli_scalm( &BLIS_TWO, &a );
+        }
+*/
+
+        bli_setsc(  (2.0/1.0), 0.0, &alpha );
+        bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+
+        // Keep a pristine copy of c so every repeat starts from the same data.
+        bli_copym( &c, &c_save );
+
+        dtime_save = 1.0e9;
+
+        for ( r = 0; r < n_repeats; ++r )
+        {
+            bli_copym( &c_save, &c );
+
+            dtime = bli_clock();
+
+
+#ifdef PRINT
+/*
+            obj_t ar, ai;
+            bli_obj_alias_to( a, ar );
+            bli_obj_alias_to( a, ai );
+            bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+            bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+
+            bli_printm( "ar", &ar, "%4.1f", "" );
+            bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+
+            bli_invertd( &a );
+            bli_printm( "a", &a, "%4.1f", "" );
+            bli_invertd( &a );
+            bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+            //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+            bli_trsm( side,
+            //bli_trsm4m( side,
+            //bli_trsm3m( side,
+                      &alpha,
+                      &a,
+                      &c );
+#else
+
+            {
+                // Arguments shared by every ?trsm_ variant. The side/uplo
+                // characters follow the settings above so both paths time
+                // the same problem, and they do not shadow side/uplo.
+                f77_char f77_side = ( bli_is_left( side ) ? 'L' : 'R' );
+                f77_char f77_uplo = ( uplo == BLIS_LOWER  ? 'L' : 'U' );
+                f77_char transa   = 'N';
+                f77_char diag     = 'N';
+                f77_int  mm       = bli_obj_length( c );
+                f77_int  nn       = bli_obj_width( c );
+                f77_int  lda      = bli_obj_col_stride( a );
+                f77_int  ldc      = bli_obj_col_stride( c );
+
+                // Pick the routine whose element type matches dt_a. The
+                // original always used strsm_/ctrsm_, i.e. it handed
+                // double-precision buffers to a single-precision routine
+                // in the default BLIS_DOUBLE configuration.
+                if ( dt_a == BLIS_FLOAT )
+                {
+                    float* alphap = bli_obj_buffer( alpha );
+                    float* ap     = bli_obj_buffer( a );
+                    float* cp     = bli_obj_buffer( c );
+
+                    strsm_( &f77_side, &f77_uplo, &transa, &diag,
+                            &mm, &nn,
+                            alphap,
+                            ap, &lda,
+                            cp, &ldc );
+                }
+                else if ( dt_a == BLIS_DOUBLE )
+                {
+                    double* alphap = bli_obj_buffer( alpha );
+                    double* ap     = bli_obj_buffer( a );
+                    double* cp     = bli_obj_buffer( c );
+
+                    dtrsm_( &f77_side, &f77_uplo, &transa, &diag,
+                            &mm, &nn,
+                            alphap,
+                            ap, &lda,
+                            cp, &ldc );
+                }
+                else if ( dt_a == BLIS_SCOMPLEX )
+                {
+                    scomplex* alphap = bli_obj_buffer( alpha );
+                    scomplex* ap     = bli_obj_buffer( a );
+                    scomplex* cp     = bli_obj_buffer( c );
+
+                    ctrsm_( &f77_side, &f77_uplo, &transa, &diag,
+                            &mm, &nn,
+                            alphap,
+                            ap, &lda,
+                            cp, &ldc );
+                }
+                else // dt_a == BLIS_DCOMPLEX
+                {
+                    dcomplex* alphap = bli_obj_buffer( alpha );
+                    dcomplex* ap     = bli_obj_buffer( a );
+                    dcomplex* cp     = bli_obj_buffer( c );
+
+                    ztrsm_( &f77_side, &f77_uplo, &transa, &diag,
+                            &mm, &nn,
+                            alphap,
+                            ap, &lda,
+                            cp, &ldc );
+                }
+            }
+
+#endif
+
+#ifdef PRINT
+            bli_printm( "c after", &c, "%4.1f", "" );
+            exit(1);
+#endif
+
+
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+
+        if ( bli_is_left( side ) )
+            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+        else
+            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+        if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+        printf( "data_trsm_blis" );
+#else
+        printf( "data_trsm_%s", BLAS );
+#endif
+        // Cast the whole index expression so the argument really is the
+        // unsigned long that %lu expects, whatever the width of dim_t.
+        printf( "( %2lu, 1:4 ) = [ %4lu %4lu  %10.3e  %6.3f ];\n",
+                ( unsigned long )( (p - p_begin + 1)/p_inc + 1 ),
+                ( unsigned long )m,
+                ( unsigned long )n, dtime_save, gflops );
+
+
+        bli_obj_free( &alpha );
+        bli_obj_free( &beta );
+
+        bli_obj_free( &a );
+        bli_obj_free( &b );
+        bli_obj_free( &c );
+        bli_obj_free( &c_save );
+    }
+
+    bli_finalize();
+
+    // Every successful MPI_Init_thread() must be paired with MPI_Finalize().
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index d7cc59796..3ce1b688f 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -221,8 +221,8 @@ void libblis_test_gemm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap ); - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 25d31036a..e2701d313 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -251,10 +251,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); // Create subpartitions from the a and b panels. @@ -268,7 +268,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 48c6ddadf..234bec68d 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -217,14 +217,14 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c );