From 01b125e815f19410e8e0611d088b84570e499e93 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 27 Feb 2014 11:55:45 -0600 Subject: [PATCH 01/42] First pass at adding parallelism to BLIS. Added a multithreading infrastructure that should be independent of the multithreading implementation in the future. Currently, gemm blocked variants 1f and 2f, and packm blocked variant 1, are parallelized. --- config/reference/bli_config.h | 2 +- config/reference/make_defs.mk | 8 +- frame/1m/packm/bli_packm_blk_var1.c | 19 ++- frame/1m/packm/bli_packm_blk_var1.h | 6 +- frame/1m/packm/bli_packm_check.c | 2 + frame/1m/packm/bli_packm_int.c | 9 +- frame/1m/packm/bli_packm_int.h | 3 +- frame/2/gemv/bli_gemv_blk_var1.c | 3 +- frame/2/gemv/bli_gemv_blk_var2.c | 3 +- frame/2/ger/bli_ger_blk_var1.c | 3 +- frame/2/ger/bli_ger_blk_var2.c | 3 +- frame/2/hemv/bli_hemv_blk_var1.c | 3 +- frame/2/hemv/bli_hemv_blk_var2.c | 3 +- frame/2/hemv/bli_hemv_blk_var3.c | 3 +- frame/2/hemv/bli_hemv_blk_var4.c | 3 +- frame/2/her/bli_her_blk_var1.c | 3 +- frame/2/her/bli_her_blk_var2.c | 3 +- frame/2/her2/bli_her2_blk_var1.c | 3 +- frame/2/her2/bli_her2_blk_var2.c | 3 +- frame/2/her2/bli_her2_blk_var3.c | 3 +- frame/2/her2/bli_her2_blk_var4.c | 3 +- frame/2/trmv/bli_trmv_l_blk_var1.c | 3 +- frame/2/trmv/bli_trmv_l_blk_var2.c | 3 +- frame/2/trmv/bli_trmv_u_blk_var1.c | 3 +- frame/2/trmv/bli_trmv_u_blk_var2.c | 3 +- frame/2/trsv/bli_trsv_l_blk_var1.c | 3 +- frame/2/trsv/bli_trsv_l_blk_var2.c | 3 +- frame/2/trsv/bli_trsv_u_blk_var1.c | 3 +- frame/2/trsv/bli_trsv_u_blk_var2.c | 3 +- frame/3/gemm/bli_gemm_blk_var1f.c | 119 ++++++++++----- frame/3/gemm/bli_gemm_blk_var1f.h | 3 +- frame/3/gemm/bli_gemm_blk_var2f.c | 115 +++++++++----- frame/3/gemm/bli_gemm_blk_var2f.h | 3 +- frame/3/gemm/bli_gemm_blk_var3f.c | 108 ++++++++----- frame/3/gemm/bli_gemm_blk_var3f.h | 3 +- frame/3/gemm/bli_gemm_cntl.c | 11 ++ frame/3/gemm/bli_gemm_cntl.h | 1 + frame/3/gemm/bli_gemm_front.c | 21 ++- frame/3/gemm/bli_gemm_int.c | 
9 +- frame/3/gemm/bli_gemm_int.h | 3 +- frame/3/gemm/bli_gemm_ker_var2.c | 3 +- frame/3/gemm/bli_gemm_ker_var2.h | 3 +- frame/3/gemm/bli_gemm_ker_var5.c | 3 +- frame/3/gemm/bli_gemm_ker_var5.h | 3 +- frame/3/hemm/bli_hemm_front.c | 3 +- frame/3/herk/bli_herk_blk_var1f.c | 9 +- frame/3/herk/bli_herk_blk_var2f.c | 9 +- frame/3/herk/bli_herk_blk_var3f.c | 9 +- frame/3/symm/bli_symm_front.c | 3 +- frame/3/trmm/bli_trmm_blk_var1f.c | 9 +- frame/3/trmm/bli_trmm_blk_var2b.c | 9 +- frame/3/trmm/bli_trmm_blk_var2f.c | 9 +- frame/3/trmm/bli_trmm_blk_var3b.c | 9 +- frame/3/trmm/bli_trmm_blk_var3f.c | 9 +- frame/3/trsm/bli_trsm_blk_var1b.c | 6 +- frame/3/trsm/bli_trsm_blk_var1f.c | 6 +- frame/3/trsm/bli_trsm_blk_var2b.c | 9 +- frame/3/trsm/bli_trsm_blk_var2f.c | 9 +- frame/3/trsm/bli_trsm_blk_var3b.c | 9 +- frame/3/trsm/bli_trsm_blk_var3f.c | 9 +- frame/base/bli_init.c | 5 +- frame/base/bli_mem.c | 5 +- frame/base/bli_threading.c | 228 ++++++++++++++++++++++++++++ frame/base/bli_threading.h | 107 +++++++++++++ frame/include/bli_extern_defs.h | 2 + frame/include/blis.h | 6 + testsuite/src/test_gemm_ukr.c | 4 +- testsuite/src/test_gemmtrsm_ukr.c | 6 +- testsuite/src/test_trsm_ukr.c | 4 +- 69 files changed, 788 insertions(+), 228 deletions(-) create mode 100644 frame/base/bli_threading.c create mode 100644 frame/base/bli_threading.h diff --git a/config/reference/bli_config.h b/config/reference/bli_config.h index b779d59df..d643a1f41 100644 --- a/config/reference/bli_config.h +++ b/config/reference/bli_config.h @@ -69,7 +69,7 @@ // -- MULTITHREADING ----------------------------------------------------------- // The maximum number of BLIS threads that will run concurrently. 
-#define BLIS_MAX_NUM_THREADS 1 +#define BLIS_MAX_NUM_THREADS 2 diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index ab2b5a462..a1e884808 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -76,14 +76,14 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- -CC := gcc +CC := gcc-4.8 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 # -fopenmp -pg +CMISCFLAGS := -std=c99 -fopenmp # -pg CDBGFLAGS := -g CWARNFLAGS := -Wall -COPTFLAGS := -O2 +COPTFLAGS := -O0 -g CKOPTFLAGS := $(COPTFLAGS) CVECFLAGS := #-msse3 -march=native # -mfpmath=sse @@ -100,7 +100,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -lm +LDFLAGS := -fopenmp -lm diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index f74dc6ccd..93c164a6d 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); void bli_packm_blk_var1( obj_t* c, - obj_t* p ) + obj_t* p, + thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -117,7 +119,8 @@ void bli_packm_blk_var1( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -140,7 +143,8 @@ void PASTEMAC(ch,varname )( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -183,6 +187,9 @@ void PASTEMAC(ch,varname )( \ to pack it. 
*/ \ if ( bli_is_zeros( uploc ) && \ bli_is_triangular( strucc ) ) return; \ +\ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ \ /* Extract the conjugation bit from the transposition argument. */ \ conjc = bli_extract_conj( transc ); \ @@ -260,8 +267,8 @@ void PASTEMAC(ch,varname )( \ \ p_begin = p_cast; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ + ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 5a2c356a5..a15173205 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var1( obj_t* c, - obj_t* p ); + obj_t* p, + thrinfo_t* t ); #undef GENTPROT @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( packm_blk_var1 ) diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index 1501f475f..500a5c85d 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -66,6 +66,8 @@ void bli_packm_int_check( obj_t* a, bli_check_error_code( e_val ); e_val = bli_check_floating_object( p ); + if(e_val == BLIS_EXPECTED_FLOATING_POINT_DATATYPE) + printf("HI\n"); bli_check_error_code( e_val ); // Check object dimensions. 
diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 1f41bbbe8..3d9adc203 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -37,7 +37,8 @@ #define FUNCPTR_T packm_fp typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* p ); + obj_t* p, + thrinfo_t* t ); static FUNCPTR_T vars[6][3] = { @@ -52,7 +53,8 @@ static FUNCPTR_T vars[6][3] = void bli_packm_int( obj_t* a, obj_t* p, - packm_t* cntl ) + packm_t* cntl, + thrinfo_t* thread ) { varnum_t n; impl_t i; @@ -119,6 +121,7 @@ void bli_packm_int( obj_t* a, // Invoke the variant with kappa_use. f( a, - p ); + p, + thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 923dcbc3c..3dc5aa73b 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -34,5 +34,6 @@ void bli_packm_int( obj_t* a, obj_t* p, - packm_t* cntl ); + packm_t* cntl, + thrinfo_t* thread ); diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/bli_gemv_blk_var1.c index c4701ea6e..5f66e1c33 100644 --- a/frame/2/gemv/bli_gemv_blk_var1.c +++ b/frame/2/gemv/bli_gemv_blk_var1.c @@ -76,7 +76,8 @@ void bli_gemv_blk_var1( obj_t* alpha, // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/bli_gemv_blk_var2.c index a9ea14856..0fb05bd28 100644 --- a/frame/2/gemv/bli_gemv_blk_var2.c +++ b/frame/2/gemv/bli_gemv_blk_var2.c @@ -81,7 +81,8 @@ void bli_gemv_blk_var2( obj_t* alpha, // Copy/pack A1, x1 (if needed). 
bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/bli_ger_blk_var1.c index 4d92a941c..7944af2dd 100644 --- a/frame/2/ger/bli_ger_blk_var1.c +++ b/frame/2/ger/bli_ger_blk_var1.c @@ -75,7 +75,8 @@ void bli_ger_blk_var1( obj_t* alpha, // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/bli_ger_blk_var2.c index 3855ca895..e5040aeb2 100644 --- a/frame/2/ger/bli_ger_blk_var2.c +++ b/frame/2/ger/bli_ger_blk_var2.c @@ -75,7 +75,8 @@ void bli_ger_blk_var2( obj_t* alpha, // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/bli_hemv_blk_var1.c index 03d942970..1e711aada 100644 --- a/frame/2/hemv/bli_hemv_blk_var1.c +++ b/frame/2/hemv/bli_hemv_blk_var1.c @@ -106,7 +106,8 @@ void bli_hemv_blk_var1( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/bli_hemv_blk_var2.c index 05211f522..0d3de9773 100644 --- a/frame/2/hemv/bli_hemv_blk_var2.c +++ b/frame/2/hemv/bli_hemv_blk_var2.c @@ -109,7 +109,8 @@ void bli_hemv_blk_var2( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/bli_hemv_blk_var3.c index d7d8fbc33..934eb0b74 100644 --- a/frame/2/hemv/bli_hemv_blk_var3.c +++ b/frame/2/hemv/bli_hemv_blk_var3.c @@ -106,7 +106,8 @@ void bli_hemv_blk_var3( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/bli_hemv_blk_var4.c index e8c5e739b..d1fe06018 100644 --- a/frame/2/hemv/bli_hemv_blk_var4.c +++ b/frame/2/hemv/bli_hemv_blk_var4.c @@ -109,7 +109,8 @@ void bli_hemv_blk_var4( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/bli_her_blk_var1.c index 64089bd6b..6501fd708 100644 --- a/frame/2/her/bli_her_blk_var1.c +++ b/frame/2/her/bli_her_blk_var1.c @@ -90,7 +90,8 @@ void bli_her_blk_var1( conj_t conjh, // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/bli_her_blk_var2.c index e88f2a29e..5579d0009 100644 --- a/frame/2/her/bli_her_blk_var2.c +++ b/frame/2/her/bli_her_blk_var2.c @@ -90,7 +90,8 @@ void bli_her_blk_var2( conj_t conjh, // Copy/pack C11, x1 (if needed). 
bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/bli_her2_blk_var1.c index e8896d026..831ef10f1 100644 --- a/frame/2/her2/bli_her2_blk_var1.c +++ b/frame/2/her2/bli_her2_blk_var1.c @@ -101,7 +101,8 @@ void bli_her2_blk_var1( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/bli_her2_blk_var2.c index ca165872f..59fd7a9c4 100644 --- a/frame/2/her2/bli_her2_blk_var2.c +++ b/frame/2/her2/bli_her2_blk_var2.c @@ -104,7 +104,8 @@ void bli_her2_blk_var2( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/bli_her2_blk_var3.c index e1ac9d555..eb0c15a53 100644 --- a/frame/2/her2/bli_her2_blk_var3.c +++ b/frame/2/her2/bli_her2_blk_var3.c @@ -104,7 +104,8 @@ void bli_her2_blk_var3( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/bli_her2_blk_var4.c index d0a8eaf9f..252d969fc 100644 --- a/frame/2/her2/bli_her2_blk_var4.c +++ b/frame/2/her2/bli_her2_blk_var4.c @@ -101,7 +101,8 @@ void bli_her2_blk_var4( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). 
bli_packm_int( &c11, &c11_pack, - cntl_sub_packm_c11( cntl ) ); + cntl_sub_packm_c11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/bli_trmv_l_blk_var1.c index c9260d7a6..431618cff 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var1.c +++ b/frame/2/trmv/bli_trmv_l_blk_var1.c @@ -80,7 +80,8 @@ void bli_trmv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/bli_trmv_l_blk_var2.c index dd6493069..d78427051 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var2.c +++ b/frame/2/trmv/bli_trmv_l_blk_var2.c @@ -80,7 +80,8 @@ void bli_trmv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/bli_trmv_u_blk_var1.c index e50293f9d..c4493310c 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var1.c +++ b/frame/2/trmv/bli_trmv_u_blk_var1.c @@ -80,7 +80,8 @@ void bli_trmv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/bli_trmv_u_blk_var2.c index d5c491daf..36048a91e 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var2.c +++ b/frame/2/trmv/bli_trmv_u_blk_var2.c @@ -80,7 +80,8 @@ void bli_trmv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c b/frame/2/trsv/bli_trsv_l_blk_var1.c index 6ffdd541e..c98537f71 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var1.c +++ b/frame/2/trsv/bli_trsv_l_blk_var1.c @@ -85,7 +85,8 @@ void bli_trsv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/bli_trsv_l_blk_var2.c index 9740a9ec0..3f258ae57 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var2.c +++ b/frame/2/trsv/bli_trsv_l_blk_var2.c @@ -85,7 +85,8 @@ void bli_trsv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/bli_trsv_u_blk_var1.c index 8d65e36fd..ce3a96eff 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var1.c +++ b/frame/2/trsv/bli_trsv_u_blk_var1.c @@ -85,7 +85,8 @@ void bli_trsv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/bli_trsv_u_blk_var2.c index 16a167b17..7f2a8e443 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var2.c +++ b/frame/2/trsv/bli_trsv_u_blk_var2.c @@ -85,7 +85,8 @@ void bli_trsv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, - cntl_sub_packm_a11( cntl ) ); + cntl_sub_packm_a11( cntl ), + &BLIS_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index e425f44fd..5024e8121 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -37,45 +37,62 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1, c1_pack; + //The s is for "lives on the stack" + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_chief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + b_pack = thread_broadcast( thread, &b_pack_s ); + + // Initialize objects passed into bli_packm_init for A and C + if( thread_am_caucus_chief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_caucus_broadcast( thread, &a1_pack_s ); + c1_pack = thread_caucus_broadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + thread ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing B. 
- bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + dim_t start, end; + bli_get_range( thread, m_trans, 8, &start, &end ); // Partition along the m dimension. - for ( i = 0; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of a (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,38 +100,58 @@ void bli_gemm_blk_var1f( obj_t* a, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); - + if( !thread_am_caucus_chief( thread ) ) + printf("DOGS\n"); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_caucus_chief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_caucus_barrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + thread_sub_caucus( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + thread_sub_caucus( thread ) ); + + // Packing must be done before computation. + thread_caucus_barrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_gemm( cntl ) ); + c1_pack, + cntl_sub_gemm( cntl ), + thread_sub_caucus( thread ) ); // Unpack C1 (if C1 was packed). 
- bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_caucus_chief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_caucus_barrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); - bli_obj_release_pack( &c1_pack ); + thread_barrier( thread ); + if( thread_am_chief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_caucus_chief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.h b/frame/3/gemm/bli_gemm_blk_var1f.h index 4e5bfcf36..c7ff240b0 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.h +++ b/frame/3/gemm/bli_gemm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index c9f29ee7b..c438d4607 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -37,45 +37,61 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; + + if( thread_am_chief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_broadcast( thread, &a_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_caucus_chief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_caucus_broadcast( thread, &b1_pack_s ); + c1_pack = thread_caucus_broadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + thread ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + dim_t start, end; + bli_get_range( thread, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -85,36 +101,55 @@ void bli_gemm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_caucus_chief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_caucus_barrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + thread_sub_caucus( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + bli_packm_int( &c1, c1_pack, + cntl_sub_packm_c( cntl ), + thread_sub_caucus( thread ) ); + + // Packing must be done before computation + thread_caucus_barrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_gemm( cntl ) ); + c1_pack, + cntl_sub_gemm( cntl ), + thread_sub_caucus( thread) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + if( thread_am_caucus_chief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_caucus_barrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_barrier( thread ); + if( thread_am_chief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_caucus_chief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var2f.h b/frame/3/gemm/bli_gemm_blk_var2f.h index 01a4c175a..488966cab 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.h +++ b/frame/3/gemm/bli_gemm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 97bcd5d87..b1378de59 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -37,45 +37,60 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_chief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_broadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). 
+ if( thread_am_caucus_chief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_caucus_broadcast( thread, &a1_pack_s ); + b1_pack = thread_caucus_broadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + thread ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + dim_t start, end; + bli_get_range( thread, k_trans, 1, &start, &end ); // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, k_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. @@ -85,26 +100,35 @@ void bli_gemm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_caucus_chief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_caucus_barrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + bli_packm_int( &a1, a1_pack, + cntl_sub_packm_a( cntl ), + thread_sub_caucus( thread ) ); // Pack B1 (if instructed). 
- bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + bli_packm_int( &b1, b1_pack, + cntl_sub_packm_b( cntl ), + thread_sub_caucus( thread ) ); + + // Packing must be done before computation. + thread_caucus_barrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_gemm( cntl ) ); + c_pack, + cntl_sub_gemm( cntl ), + thread_sub_caucus( thread) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it @@ -112,17 +136,21 @@ void bli_gemm_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. - if ( i == 0 ) bli_obj_scalar_reset( &c_pack ); + if ( i == 0 ) bli_obj_scalar_reset( c_pack ); } // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, + bli_unpackm_int( c_pack, c, cntl_sub_unpackm_c( cntl ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + thread_barrier( thread ); + if( thread_am_caucus_chief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } + if( thread_am_chief( thread ) ) + bli_obj_release_pack( c_pack ); } diff --git a/frame/3/gemm/bli_gemm_blk_var3f.h b/frame/3/gemm/bli_gemm_blk_var3f.h index ba6716215..8bbbde559 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.h +++ b/frame/3/gemm/bli_gemm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 53dcf86be..19458e52e 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,6 +55,17 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; +dim_t gemm_caucuses_at_level[5] = {1, 1, 2, 1, 1}; + +thrinfo_t* bli_gemm_cntl_get_thrinfos() +{ + return bli_create_thread_info( gemm_caucuses_at_level, 5 ); +} + +void bli_gemm_cntl_free_thrinfos(thrinfo_t* tofree) +{ + //MEMORYLEAK +} void bli_gemm_cntl_init() { diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 882b746eb..6a94bf892 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -65,3 +65,4 @@ gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, gemm_t* sub_gemm, unpackm_t* sub_unpack_c ); +thrinfo_t* bli_gemm_cntl_get_thrinfos(); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index d139f6b3c..c1d48cc27 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -74,12 +74,21 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } + thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); + dim_t n_threads = thread_num_threads( (&infos[0]) ); + // Invoke the internal back-end. 
- bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + bli_gemm_int( alpha, + &a_local, + &b_local, + beta, + &c_local, + cntl, + &infos[omp_id] ); + } } diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index f6fc6d284..44c4bdca1 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); static FUNCPTR_T vars[6][3] = { @@ -57,7 +58,8 @@ void bli_gemm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -133,6 +135,7 @@ void bli_gemm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 5181f3253..cc2e4a929 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -37,5 +37,6 @@ void bli_gemm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index b800c0afe..ab5585d7c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -54,7 +54,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.h b/frame/3/gemm/bli_gemm_ker_var2.h index 62ebd6041..e41ee44be 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.h +++ b/frame/3/gemm/bli_gemm_ker_var2.h @@ -39,7 +39,8 @@ void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); // diff --git 
a/frame/3/gemm/bli_gemm_ker_var5.c b/frame/3/gemm/bli_gemm_ker_var5.c index 2e4599995..d89f0ccd3 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.c +++ b/frame/3/gemm/bli_gemm_ker_var5.c @@ -54,7 +54,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5); void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ) + gemm_t* cntl, + thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_ker_var5.h b/frame/3/gemm/bli_gemm_ker_var5.h index 6c79226ea..dee007158 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.h +++ b/frame/3/gemm/bli_gemm_ker_var5.h @@ -39,7 +39,8 @@ void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, - gemm_t* cntl ); + gemm_t* cntl, + thrinfo_t* thread ); // diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index eaaee7b2c..c8af4cefd 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -86,6 +86,7 @@ void bli_hemm_front( side_t side, &b_local, beta, &c_local, - cntl ); + cntl, + &BLIS_SINGLE_THREADED ); } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index f3c7c31bc..d53f4649f 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -66,7 +66,8 @@ void bli_herk_blk_var1f( obj_t* a, // Pack A' (if instructed). bli_packm_int( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the m dimension. for ( i = 0; i < m_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_herk_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform herk subproblem. 
bli_herk_int( &BLIS_ONE, diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 4b4f77df3..d872980a6 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -73,7 +73,8 @@ void bli_herk_blk_var2f( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -103,11 +104,13 @@ void bli_herk_blk_var2f( obj_t* a, // Pack A1' (if instructed). bli_packm_int( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1S, &c1S_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 436378e80..e3195ddba 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -66,7 +66,8 @@ void bli_herk_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_herk_blk_var3f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Perform herk subproblem. 
bli_herk_int( &BLIS_ONE, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 303154caa..ef1af18a6 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -85,6 +85,7 @@ void bli_symm_front( side_t side, &b_local, beta, &c_local, - cntl ); + cntl, + &BLIS_SINGLE_THREADED ); } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index a21ff4876..5fbcb79e5 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -77,7 +77,8 @@ void bli_trmm_blk_var1f( obj_t* a, // Pack B (if instructed). bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -100,11 +101,13 @@ void bli_trmm_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 9ff3f0af7..d8669cf71 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -66,7 +66,8 @@ void bli_trmm_blk_var2b( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trmm_blk_var2b( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). 
bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 35a665ab6..7bcc529e8 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -66,7 +66,8 @@ void bli_trmm_blk_var2f( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trmm_blk_var2f( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 91921965c..1a52a76ca 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -66,7 +66,8 @@ void bli_trmm_blk_var3b( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trmm_blk_var3b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trmm subproblem. 
bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 2f3d6fd46..c0c65166f 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -66,7 +66,8 @@ void bli_trmm_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trmm_blk_var3f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 82158b707..26931e73a 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -68,7 +68,8 @@ void bli_trsm_blk_var1b( obj_t* a, // Pack B1 (if instructed). bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the remaining portion of the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -91,7 +92,8 @@ void bli_trsm_blk_var1b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index faa49d25f..cf3d8399d 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -67,7 +67,8 @@ void bli_trsm_blk_var1f( obj_t* a, // Pack B1 (if instructed). 
bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the remaining portion of the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -88,7 +89,8 @@ void bli_trsm_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 970fc9307..0920dc461 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -66,7 +66,8 @@ void bli_trsm_blk_var2b( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trsm_blk_var2b( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index d1fe788da..562d8782a 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -66,7 +66,8 @@ void bli_trsm_blk_var2f( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trsm_blk_var2f( obj_t* a, // Pack B1 (if instructed). 
bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index c9cde7bd7..3b9ae3478 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -66,7 +66,8 @@ void bli_trsm_blk_var3b( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trsm_blk_var3b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 71737dc3d..625909d23 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -66,7 +66,8 @@ void bli_trsm_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ) ); + cntl_sub_packm_c( cntl ), + &BLIS_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -89,11 +90,13 @@ void bli_trsm_blk_var3f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + cntl_sub_packm_a( cntl ), + &BLIS_SINGLE_THREADED ); // Pack B1 (if instructed). 
bli_packm_int( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + cntl_sub_packm_b( cntl ), + &BLIS_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 752f23279..5c7ec7997 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -46,7 +46,8 @@ obj_t BLIS_MINUS_ONE_HALF; obj_t BLIS_MINUS_ONE; obj_t BLIS_MINUS_TWO; - +thrinfo_t BLIS_SINGLE_THREADED; +thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) { @@ -59,6 +60,8 @@ void bli_init( void ) bli_error_msgs_init(); bli_mem_init(); + + bli_setup_single_threaded_info( &BLIS_SINGLE_THREADED, &BLIS_SINGLE_COMM ); } void bli_finalize( void ) diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 01f8b5eba..6df0361d4 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -127,7 +127,7 @@ void bli_mem_acquire_m( siz_t req_size, // BEGIN CRITICAL SECTION - + _Pragma( "omp critical (mem)" ){ // Query the index of the contiguous memory block that resides at the // "top" of the pool. @@ -145,6 +145,7 @@ void bli_mem_acquire_m( siz_t req_size, // END CRITICAL SECTION + } // Query the size of the blocks in the pool so we can store it in the // mem_t object. @@ -198,6 +199,7 @@ void bli_mem_release( mem_t* mem ) // BEGIN CRITICAL SECTION + _Pragma( "omp critical (mem)" ){ // Increment the top of the memory pool. @@ -211,6 +213,7 @@ void bli_mem_release( mem_t* mem ) // END CRITICAL SECTION + } } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c new file mode 100644 index 000000000..77d948b77 --- /dev/null +++ b/frame/base/bli_threading.c @@ -0,0 +1,228 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_cleanup_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; + bli_destroy_lock( &communicator->barrier_lock ); +} +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) +{ + if( communicator == NULL ) return; + communicator->sent_object = NULL; + communicator->n_threads = n_threads; + communicator->barrier_sense = 0; + bli_init_lock( &communicator->barrier_lock ); + communicator->barrier_threads_arrived = 0; +} + +thread_comm_t* bli_create_communicator( dim_t n_threads ) +{ + thread_comm_t* comm = (thread_comm_t*) bli_malloc( sizeof(thread_comm_t) ); + bli_setup_communicator( comm, n_threads ); + return comm; +} + +void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_send ) +{ + if( communicator == NULL || communicator->n_threads == 1 ) return to_send; + + if( id == 0 ) communicator->sent_object = to_send; + + bli_barrier( communicator, id ); + void * object = communicator->sent_object; + bli_barrier( communicator, id ); + + return object; +} + +void bli_init_lock( lock_t* lock ) +{ + omp_init_lock( lock ); +} +void bli_destroy_lock( lock_t* lock ) +{ + omp_destroy_lock( lock ); +} +void bli_set_lock( lock_t* lock ) +{ + omp_set_lock( lock ); +} +void bli_unset_lock( lock_t* lock ) +{ + omp_unset_lock( lock ); +} + +//barrier routine taken from art of multicore programming or something +void bli_barrier( thread_comm_t* communicator, dim_t t_id ) +{ + if(communicator == NULL || communicator->n_threads == 1) + return; + bool_t my_sense = communicator->barrier_sense; + dim_t my_threads_arrived; + + bli_set_lock(&communicator->barrier_lock); + my_threads_arrived = communicator->barrier_threads_arrived + 1; + communicator->barrier_threads_arrived = my_threads_arrived; + bli_unset_lock(&communicator->barrier_lock); + + if( my_threads_arrived == communicator->n_threads ) { + + bli_set_lock(&communicator->barrier_lock); + 
communicator->barrier_threads_arrived = 0; + communicator->barrier_sense = !communicator->barrier_sense; + bli_unset_lock(&communicator->barrier_lock); + } + else { + volatile bool_t* listener = &communicator->barrier_sense; + while( *listener == my_sense ) {} + } +} + +//Recursively create thread communicators +void create_comms( dim_t* caucuses_at_level, dim_t n_levels, dim_t cur_level, + thread_comm_tree_t* parent, thread_comm_tree_t* leaves, dim_t global_id ) +{ + //Create a communicator + dim_t n_threads = 1; + for( dim_t i = cur_level; i < n_levels; i++) + n_threads *= caucuses_at_level[i]; + + + thread_comm_t* comm = bli_create_communicator( n_threads ); + thread_comm_tree_t* info; + if( cur_level == n_levels ) + { + leaves[global_id].parent = parent; + leaves[global_id].comm = comm; + return; + } + else + { + info = (thread_comm_tree_t*)bli_malloc(sizeof(thread_comm_tree_t)); + info->comm = comm; + info->parent = parent; + } + + //Now create child communicators + dim_t caucuses = caucuses_at_level[cur_level]; + for( dim_t i = 0; i < caucuses; i++) + create_comms( caucuses_at_level, n_levels, cur_level+1, info, leaves, global_id * caucuses + i); +} + +void bli_setup_thrinfo_t(thrinfo_t* thr, thread_comm_t* comm, dim_t comm_id, + thrinfo_t* caucus, dim_t n_caucuses, dim_t caucus_id ) +{ + thr->ocomm = comm; + thr->ocomm_id = comm_id; + thr->caucus = caucus; + thr->n_caucuses = n_caucuses; + thr->caucus_id = caucus_id; +} + +thrinfo_t* bli_create_thrinfo_t( thread_comm_t* comm, dim_t comm_id, thrinfo_t* caucus, dim_t n_caucuses, dim_t caucus_id ) +{ + thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) ); + thr->ocomm = comm; + thr->ocomm_id = comm_id; + thr->caucus = caucus; + thr->n_caucuses = n_caucuses; + thr->caucus_id = caucus_id; + return thr; +} + +void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ) +{ + bli_setup_communicator( comm, 1 ); + bli_setup_thrinfo_t( thr, comm, 0, NULL, 1, 0 ); + thr->caucus = thr; +} + 
+thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) +{ + //Calculate total number of threads + dim_t n_threads = 1; + for( dim_t i = 0; i < n_levels; i++) + n_threads *= caucuses_at_level[i]; + + //Create communicators + thread_comm_tree_t* comm_leaves = (thread_comm_tree_t*)bli_malloc( sizeof(thread_comm_tree_t) * n_threads); + create_comms( caucuses_at_level, n_levels, 0, NULL, comm_leaves, 0 ); + thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t*) * n_threads ); + + //Now create paths upwards + for( dim_t i = 0; i < n_threads; i++ ) + { + thread_comm_tree_t* comm_node = &comm_leaves[i]; + + //Setup thread info for the bottom-most level + thrinfo_t* bot = &BLIS_SINGLE_THREADED; //bli_create_thrinfo_t( comm_node->comm, 0, NULL, 1, 0 ); + + //Now build thread infos upwards + comm_node = comm_node->parent; + thrinfo_t* cur; + thrinfo_t* prev = bot; + for( dim_t j = 0; j < n_levels; j++ ) + { + if( j == n_levels - 1 ) + cur = &info_paths[i]; + else + cur = (thrinfo_t*)bli_malloc(sizeof(thrinfo_t)); + + dim_t caucus_size = prev->ocomm->n_threads; + dim_t ocomm_id = i % comm_node->comm->n_threads; + dim_t caucus_id = ocomm_id / caucus_size; + + bli_setup_thrinfo_t(cur, comm_node->comm, ocomm_id, + prev, caucuses_at_level[n_levels - j - 1], caucus_id ); + + cur = prev; + comm_node = comm_node->parent; + } + } + return info_paths; +} + +void bli_get_range( thrinfo_t* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ) +{ + dim_t n_caucuses = thread->n_caucuses; + dim_t caucus_id = thread->caucus_id; + dim_t n_pt = size / n_caucuses; + n_pt = (n_pt * n_caucuses < size) ? n_pt + 1 : n_pt; + n_pt = (n_pt % block_factor == 0) ? 
n_pt : n_pt + block_factor - (n_pt % block_factor); + *start = caucus_id * n_pt; + *end = bli_min( *start + n_pt, size ); +} diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h new file mode 100644 index 000000000..35936b791 --- /dev/null +++ b/frame/base/bli_threading.h @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +#ifndef BLIS_THREADING_H +#define BLIS_THREADING_H + +typedef omp_lock_t lock_t; + +struct thread_comm_s +{ + void* sent_object; + dim_t n_threads; + + bool_t barrier_sense; + lock_t barrier_lock; + dim_t barrier_threads_arrived; +}; +typedef struct thread_comm_s thread_comm_t; + +struct thread_comm_tree_s +{ + struct thread_comm_tree_s* parent; + thread_comm_t* comm; +}; +typedef struct thread_comm_tree_s thread_comm_tree_t; + + +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads ); +thread_comm_t* bli_create_communicator( dim_t n_threads ); + +void* bli_broadcast_structure( thread_comm_t* communicator, dim_t inside_id, void* to_send ); + +void bli_barrier( thread_comm_t* communicator, dim_t thread_id ); +void bli_set_lock( lock_t* lock ); +void bli_unset_lock( lock_t* lock ); +void bli_init_lock( lock_t* lock ); +void bli_destroy_lock( lock_t* lock ); + +/* + * Each thrinfo_t is a linked list. + * It represents a path through a thread communicator hierarchy. 
+ * There is a 1:1 correspondence between leaf nodes and thrinfo_t + * + * When we hit a loop, we advance the linked list towards the bottom of the hierarchy + */ +struct thrinfo_s +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + + struct thrinfo_s* caucus; //my thread info for the caucus I am a part of + dim_t n_caucuses; //Number of distinct caucuses used to parallelize the loop + dim_t caucus_id; //Which caucus we are part of +}; +typedef struct thrinfo_s thrinfo_t; + +#define thread_comm( thread ) thread->ocomm +#define thread_caucus_comm( thread ) (thread->caucus->ocomm) + +#define thread_id( thread ) thread->ocomm_id +#define thread_num_threads( thread ) thread->ocomm->n_threads +#define thread_sub_caucus( thread ) thread->caucus +#define thread_caucus_id( thread ) thread->caucus_id +#define thread_num_caucuses( thread ) thread->n_caucuses +#define thread_am_chief( thread ) (thread->ocomm_id == 0) +#define thread_am_caucus_chief( thread ) (thread->caucus->ocomm_id == 0) + +#define thread_broadcast( thread, ptr ) bli_broadcast_structure( thread->ocomm, thread->ocomm_id, ptr ) +#define thread_caucus_broadcast( thread, ptr ) bli_broadcast_structure( thread->caucus->ocomm, thread->caucus->ocomm_id, ptr ) +#define thread_barrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) +#define thread_caucus_barrier( thread ) bli_barrier( thread->caucus->ocomm, thread->caucus->ocomm_id ) + +thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); +void bli_get_range( thrinfo_t* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ); +void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); + +#endif diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 088e9c0f2..8f8440433 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ 
-43,4 +43,6 @@ extern obj_t BLIS_MINUS_ONE_HALF; extern obj_t BLIS_MINUS_ONE; extern obj_t BLIS_MINUS_TWO; +extern thrinfo_t BLIS_SINGLE_THREADED; + #endif diff --git a/frame/include/blis.h b/frame/include/blis.h index 3bc5a3ca6..53112413d 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -65,6 +65,12 @@ extern "C" { #include "bli_type_defs.h" #include "bli_macro_defs.h" + +// -- Threading definitions -- +#include +#include "bli_threading.h" + +// -- Constant definitions -- #include "bli_extern_defs.h" diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 25f44dc69..fc73eea86 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -221,8 +221,8 @@ void libblis_test_gemm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap ); - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index e019a264c..f40d54eb5 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -251,10 +251,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap ); + bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); // Create subpartitions from the a and b panels. @@ -268,7 +268,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. 
- bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); time = bli_clock(); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index bb003fd19..2262b0a0d 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -217,14 +217,14 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap ); + bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp ); + bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); bli_copym( &c_save, &c ); From 6193d9ceea552e67170dba45abde04c64271c705 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 27 Feb 2014 14:09:19 -0600 Subject: [PATCH 02/42] Fixed bug in thread trees --- frame/1m/packm/bli_packm_check.c | 2 -- frame/3/gemm/bli_gemm_blk_var1f.c | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index 500a5c85d..1501f475f 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -66,8 +66,6 @@ void bli_packm_int_check( obj_t* a, bli_check_error_code( e_val ); e_val = bli_check_floating_object( p ); - if(e_val == BLIS_EXPECTED_FLOATING_POINT_DATATYPE) - printf("HI\n"); bli_check_error_code( e_val ); // Check object dimensions. diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 5024e8121..568f176d3 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -100,9 +100,8 @@ void bli_gemm_blk_var1f( obj_t* a, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); - if( !thread_am_caucus_chief( thread ) ) - printf("DOGS\n"); - // Initialize objects for packing A1 and C1. 
+ // Initialize objects for packing A1 and C1. if( thread_am_caucus_chief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); From bfe214b633765ed40b57b330fbb84c332663aa40 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 27 Feb 2014 15:53:10 -0600 Subject: [PATCH 03/42] Fixed bug with parallel packing, and bug with allocating an array of thread infos In packm variant 1, the variable p_begin was incremented each iteration, causing a dependency. This dependency was removed, allowing each iteration to be executed in parallel. Somewhere in bli_threading.c, I was allocating an array of pointers instead of an array of structs. --- frame/1m/packm/bli_packm_blk_var1.c | 10 ++++------ frame/base/bli_threading.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 93c164a6d..aa0c0e56b 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -187,9 +187,6 @@ void PASTEMAC(ch,varname )( \ to pack it. */ \ if ( bli_is_zeros( uploc ) && \ bli_is_triangular( strucc ) ) return; \ -\ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ \ /* Extract the conjugation bit from the transposition argument.
*/ \ conjc = bli_extract_conj( transc ); \ @@ -266,6 +263,9 @@ void PASTEMAC(ch,varname )( \ } \ \ p_begin = p_cast; \ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ +\ \ for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ @@ -274,6 +274,7 @@ void PASTEMAC(ch,varname )( \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ + p_begin = p_cast + (ip )*ps_p; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -389,9 +390,6 @@ void PASTEMAC(ch,varname )( \ BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \ p_inc = ldp * panel_len_max_i; \ } \ -\ -\ - p_begin += p_inc; \ } \ \ \ diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 77d948b77..55aa5ff2e 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -181,7 +181,7 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) //Create communicators thread_comm_tree_t* comm_leaves = (thread_comm_tree_t*)bli_malloc( sizeof(thread_comm_tree_t) * n_threads); create_comms( caucuses_at_level, n_levels, 0, NULL, comm_leaves, 0 ); - thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t*) * n_threads ); + thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t) * n_threads ); //Now create paths upwards for( dim_t i = 0; i < n_threads; i++ ) @@ -209,7 +209,7 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) bli_setup_thrinfo_t(cur, comm_node->comm, ocomm_id, prev, caucuses_at_level[n_levels - j - 1], caucus_id ); - cur = prev; + prev = cur; comm_node = comm_node->parent; } } From e4738c48e00b89391d9baa1fd0aa62d1ea2f95e6 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 27 Feb 2014 16:29:46 -0600 Subject: [PATCH 04/42] Added support for parallelism in gemm micro-kernel 
--- frame/3/gemm/bli_gemm_ker_var2.c | 35 +++++++++++++++++--------------- frame/3/gemm/bli_gemm_ker_var2.h | 3 ++- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index ab5585d7c..4c01c9841 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -45,7 +45,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); @@ -118,7 +119,8 @@ void bli_gemm_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -134,7 +136,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -214,18 +217,21 @@ void PASTEMAC(ch,varname)( \ bli_auxinfo_set_ps_a( ps_a, aux ); \ bli_auxinfo_set_ps_b( ps_b, aux ); \ \ - b1 = b_cast; \ - c1 = c_cast; \ + thrinfo_t* caucus = thread_sub_caucus( thread ); \ + dim_t l2_num_threads = thread_num_caucuses( thread ); \ + dim_t l2_thread_id = thread_caucus_id( thread ); \ + dim_t l1_num_threads = thread_num_caucuses( caucus ); \ + dim_t l1_thread_id = thread_caucus_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = l2_thread_id; j < n_iter; j += l2_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ -\ - a1 = a_cast; \ - c11 = c1; \ + \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -233,9 +239,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = l1_thread_id; i < m_iter; i += l1_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ @@ -283,13 +292,7 @@ void PASTEMAC(ch,varname)( \ beta_cast, \ c11, rs_c, cs_c ); \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ diff --git a/frame/3/gemm/bli_gemm_ker_var2.h b/frame/3/gemm/bli_gemm_ker_var2.h index e41ee44be..71248819b 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.h +++ b/frame/3/gemm/bli_gemm_ker_var2.h @@ -58,7 +58,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( gemm_ker_var2 ) From 2e727a025a8f796d2b6bd14f489d0ee72e7d1fc7 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 10 Mar 2014 15:14:33 -0500 Subject: [PATCH 05/42] Modifying the thread info data structures This change makes each operation have its own thread info type, allowing more fine control of threading in operations that have different types of suboperations --- frame/1m/packm/bli_packm_blk_var1.c | 6 +-- frame/1m/packm/bli_packm_blk_var1.h | 4 +- frame/1m/packm/bli_packm_int.c | 4 +- frame/1m/packm/bli_packm_int.h | 2 +- frame/2/gemv/bli_gemv_blk_var1.c | 2 +- frame/2/gemv/bli_gemv_blk_var2.c | 2 +- frame/2/ger/bli_ger_blk_var1.c | 2 +- frame/2/ger/bli_ger_blk_var2.c | 2 +- frame/2/hemv/bli_hemv_blk_var1.c | 2 +- frame/2/hemv/bli_hemv_blk_var2.c | 2 +- frame/2/hemv/bli_hemv_blk_var3.c | 2 +- frame/2/hemv/bli_hemv_blk_var4.c | 2 +- frame/2/her/bli_her_blk_var1.c | 2 +- frame/2/her/bli_her_blk_var2.c | 2 +- frame/2/her2/bli_her2_blk_var1.c | 2 +- frame/2/her2/bli_her2_blk_var2.c | 2 +- frame/2/her2/bli_her2_blk_var3.c | 2 
+- frame/2/her2/bli_her2_blk_var4.c | 2 +- frame/2/trmv/bli_trmv_l_blk_var1.c | 2 +- frame/2/trmv/bli_trmv_l_blk_var2.c | 2 +- frame/2/trmv/bli_trmv_u_blk_var1.c | 2 +- frame/2/trmv/bli_trmv_u_blk_var2.c | 2 +- frame/2/trsv/bli_trsv_l_blk_var1.c | 2 +- frame/2/trsv/bli_trsv_l_blk_var2.c | 2 +- frame/2/trsv/bli_trsv_u_blk_var1.c | 2 +- frame/2/trsv/bli_trsv_u_blk_var2.c | 2 +- frame/3/gemm/bli_gemm.h | 1 - frame/3/gemm/bli_gemm_blk_var1f.c | 36 +++++++++--------- frame/3/gemm/bli_gemm_blk_var1f.h | 2 +- frame/3/gemm/bli_gemm_blk_var2f.c | 36 +++++++++--------- frame/3/gemm/bli_gemm_blk_var2f.h | 2 +- frame/3/gemm/bli_gemm_blk_var3f.c | 47 +++++++++++------------ frame/3/gemm/bli_gemm_blk_var3f.h | 2 +- frame/3/gemm/bli_gemm_cntl.c | 6 +-- frame/3/gemm/bli_gemm_cntl.h | 2 +- frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_int.c | 4 +- frame/3/gemm/bli_gemm_int.h | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.h | 2 +- frame/3/gemm/bli_gemm_ker_var5.c | 2 +- frame/3/gemm/bli_gemm_ker_var5.h | 2 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/herk/bli_herk_blk_var1f.c | 6 +-- frame/3/herk/bli_herk_blk_var2f.c | 6 +-- frame/3/herk/bli_herk_blk_var3f.c | 6 +-- frame/3/symm/bli_symm_front.c | 2 +- frame/3/trmm/bli_trmm_blk_var1f.c | 6 +-- frame/3/trmm/bli_trmm_blk_var2b.c | 6 +-- frame/3/trmm/bli_trmm_blk_var2f.c | 6 +-- frame/3/trmm/bli_trmm_blk_var3b.c | 6 +-- frame/3/trmm/bli_trmm_blk_var3f.c | 6 +-- frame/3/trsm/bli_trsm_blk_var1b.c | 4 +- frame/3/trsm/bli_trsm_blk_var1f.c | 4 +- frame/3/trsm/bli_trsm_blk_var2b.c | 6 +-- frame/3/trsm/bli_trsm_blk_var2f.c | 6 +-- frame/3/trsm/bli_trsm_blk_var3b.c | 6 +-- frame/3/trsm/bli_trsm_blk_var3f.c | 6 +-- frame/base/bli_init.c | 6 ++- frame/base/bli_threading.c | 54 +++++++++++++-------------- frame/base/bli_threading.h | 58 +++++++++++++---------------- frame/include/bli_extern_defs.h | 3 +- 62 files changed, 204 insertions(+), 211 deletions(-) diff --git 
a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 93c164a6d..6cac53938 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -53,7 +53,7 @@ typedef void (*FUNCPTR_T)( void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, dim_t pd_p, inc_t ps_p, - thrinfo_t* thread + packm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); @@ -61,7 +61,7 @@ static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); void bli_packm_blk_var1( obj_t* c, obj_t* p, - thrinfo_t* t ) + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -144,7 +144,7 @@ void PASTEMAC(ch,varname )( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ - thrinfo_t* thread \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index a15173205..e4cd44e78 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -34,7 +34,7 @@ void bli_packm_blk_var1( obj_t* c, obj_t* p, - thrinfo_t* t ); + packm_thrinfo_t* t ); #undef GENTPROT @@ -57,7 +57,7 @@ void PASTEMAC(ch,varname)( \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ - thrinfo_t* thread \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( packm_blk_var1 ) diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 3d9adc203..db7ea8f64 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -38,7 +38,7 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* p, - thrinfo_t* t ); + packm_thrinfo_t* t ); static FUNCPTR_T vars[6][3] = { @@ -54,7 +54,7 @@ static FUNCPTR_T vars[6][3] = void bli_packm_int( obj_t* a, obj_t* p, packm_t* cntl, - thrinfo_t* thread ) + packm_thrinfo_t* thread ) { varnum_t n; impl_t i; diff --git a/frame/1m/packm/bli_packm_int.h 
b/frame/1m/packm/bli_packm_int.h index 3dc5aa73b..1e6a122ac 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -35,5 +35,5 @@ void bli_packm_int( obj_t* a, obj_t* p, packm_t* cntl, - thrinfo_t* thread ); + packm_thrinfo_t* thread ); diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/bli_gemv_blk_var1.c index 5f66e1c33..4f95118c2 100644 --- a/frame/2/gemv/bli_gemv_blk_var1.c +++ b/frame/2/gemv/bli_gemv_blk_var1.c @@ -77,7 +77,7 @@ void bli_gemv_blk_var1( obj_t* alpha, // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/bli_gemv_blk_var2.c index 0fb05bd28..4d6fdba0d 100644 --- a/frame/2/gemv/bli_gemv_blk_var2.c +++ b/frame/2/gemv/bli_gemv_blk_var2.c @@ -82,7 +82,7 @@ void bli_gemv_blk_var2( obj_t* alpha, // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/bli_ger_blk_var1.c index 7944af2dd..e22c69fd9 100644 --- a/frame/2/ger/bli_ger_blk_var1.c +++ b/frame/2/ger/bli_ger_blk_var1.c @@ -76,7 +76,7 @@ void bli_ger_blk_var1( obj_t* alpha, // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x( cntl ) ); diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/bli_ger_blk_var2.c index e5040aeb2..a1a5eeb45 100644 --- a/frame/2/ger/bli_ger_blk_var2.c +++ b/frame/2/ger/bli_ger_blk_var2.c @@ -76,7 +76,7 @@ void bli_ger_blk_var2( obj_t* alpha, // Copy/pack A1, y1 (if needed). 
bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntl_sub_packv_y( cntl ) ); diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/bli_hemv_blk_var1.c index 1e711aada..5beb1be75 100644 --- a/frame/2/hemv/bli_hemv_blk_var1.c +++ b/frame/2/hemv/bli_hemv_blk_var1.c @@ -107,7 +107,7 @@ void bli_hemv_blk_var1( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/bli_hemv_blk_var2.c index 0d3de9773..371f53d82 100644 --- a/frame/2/hemv/bli_hemv_blk_var2.c +++ b/frame/2/hemv/bli_hemv_blk_var2.c @@ -110,7 +110,7 @@ void bli_hemv_blk_var2( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/bli_hemv_blk_var3.c index 934eb0b74..072706300 100644 --- a/frame/2/hemv/bli_hemv_blk_var3.c +++ b/frame/2/hemv/bli_hemv_blk_var3.c @@ -107,7 +107,7 @@ void bli_hemv_blk_var3( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/bli_hemv_blk_var4.c index d1fe06018..d4fc17324 100644 --- a/frame/2/hemv/bli_hemv_blk_var4.c +++ b/frame/2/hemv/bli_hemv_blk_var4.c @@ -110,7 +110,7 @@ void bli_hemv_blk_var4( conj_t conjh, // Copy/pack A11, x1, y1 (if needed). 
bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/bli_her_blk_var1.c index 6501fd708..7121ff0b1 100644 --- a/frame/2/her/bli_her_blk_var1.c +++ b/frame/2/her/bli_her_blk_var1.c @@ -91,7 +91,7 @@ void bli_her_blk_var1( conj_t conjh, // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/bli_her_blk_var2.c index 5579d0009..b9bf2154c 100644 --- a/frame/2/her/bli_her_blk_var2.c +++ b/frame/2/her/bli_her_blk_var2.c @@ -91,7 +91,7 @@ void bli_her_blk_var2( conj_t conjh, // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/bli_her2_blk_var1.c index 831ef10f1..645b9de79 100644 --- a/frame/2/her2/bli_her2_blk_var1.c +++ b/frame/2/her2/bli_her2_blk_var1.c @@ -102,7 +102,7 @@ void bli_her2_blk_var1( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/bli_her2_blk_var2.c index 59fd7a9c4..d6876de3e 100644 --- a/frame/2/her2/bli_her2_blk_var2.c +++ b/frame/2/her2/bli_her2_blk_var2.c @@ -105,7 +105,7 @@ void bli_her2_blk_var2( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). 
bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/bli_her2_blk_var3.c index eb0c15a53..7e84b5830 100644 --- a/frame/2/her2/bli_her2_blk_var3.c +++ b/frame/2/her2/bli_her2_blk_var3.c @@ -105,7 +105,7 @@ void bli_her2_blk_var3( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/bli_her2_blk_var4.c index 252d969fc..4760606f9 100644 --- a/frame/2/her2/bli_her2_blk_var4.c +++ b/frame/2/her2/bli_her2_blk_var4.c @@ -102,7 +102,7 @@ void bli_her2_blk_var4( conj_t conjh, // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntl_sub_packm_c11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/bli_trmv_l_blk_var1.c index 431618cff..5550e9ee9 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var1.c +++ b/frame/2/trmv/bli_trmv_l_blk_var1.c @@ -81,7 +81,7 @@ void bli_trmv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/bli_trmv_l_blk_var2.c index d78427051..1db28eb11 100644 --- a/frame/2/trmv/bli_trmv_l_blk_var2.c +++ b/frame/2/trmv/bli_trmv_l_blk_var2.c @@ -81,7 +81,7 @@ void bli_trmv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/bli_trmv_u_blk_var1.c index c4493310c..1e82157af 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var1.c +++ b/frame/2/trmv/bli_trmv_u_blk_var1.c @@ -81,7 +81,7 @@ void bli_trmv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/bli_trmv_u_blk_var2.c index 36048a91e..0c9ea6d0b 100644 --- a/frame/2/trmv/bli_trmv_u_blk_var2.c +++ b/frame/2/trmv/bli_trmv_u_blk_var2.c @@ -81,7 +81,7 @@ void bli_trmv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c b/frame/2/trsv/bli_trsv_l_blk_var1.c index c98537f71..b7b7e382a 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var1.c +++ b/frame/2/trsv/bli_trsv_l_blk_var1.c @@ -86,7 +86,7 @@ void bli_trsv_l_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/bli_trsv_l_blk_var2.c index 3f258ae57..5e2718cb2 100644 --- a/frame/2/trsv/bli_trsv_l_blk_var2.c +++ b/frame/2/trsv/bli_trsv_l_blk_var2.c @@ -86,7 +86,7 @@ void bli_trsv_l_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). 
bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/bli_trsv_u_blk_var1.c index ce3a96eff..6f6c55558 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var1.c +++ b/frame/2/trsv/bli_trsv_u_blk_var1.c @@ -86,7 +86,7 @@ void bli_trsv_u_blk_var1( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/bli_trsv_u_blk_var2.c index 7f2a8e443..7611e53dc 100644 --- a/frame/2/trsv/bli_trsv_u_blk_var2.c +++ b/frame/2/trsv/bli_trsv_u_blk_var2.c @@ -86,7 +86,7 @@ void bli_trsv_u_blk_var2( obj_t* alpha, // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntl_sub_packm_a11( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntl_sub_packv_x1( cntl ) ); diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index 64a97777d..b6981135b 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -47,7 +47,6 @@ #include "bli_gemm_ref_mxn.h" - // // Prototype object-based interface. // diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 568f176d3..368c303cf 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -38,7 +38,7 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { //The s is for "lives on the stack" obj_t b_pack_s; @@ -53,7 +53,7 @@ void bli_gemm_blk_var1f( obj_t* a, dim_t b_alg; dim_t m_trans; - if( thread_am_chief( thread ) ) { + if( thread_am_ochief( thread ) ) { // Initialize object for packing B. 
bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, @@ -65,20 +65,20 @@ void bli_gemm_blk_var1f( obj_t* a, c, cntl_sub_scalm( cntl ) ); } - b_pack = thread_broadcast( thread, &b_pack_s ); + b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize objects passed into bli_packm_init for A and C - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } - a1_pack = thread_caucus_broadcast( thread, &a1_pack_s ); - c1_pack = thread_caucus_broadcast( thread, &c1_pack_s ); + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack B (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), - thread ); + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); @@ -102,26 +102,26 @@ void bli_gemm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); // Packing must be done before computation. - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, @@ -130,25 +130,25 @@ void bli_gemm_blk_var1f( obj_t* a, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_gemm( thread ) ); // Unpack C1 (if C1 was packed). 
// Currently must be done by 1 thread - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ) ); } //Barrier to make sure unpacking is done before next iteration's packing of C //Somehow, we'd like to make this a noop if packing isn't done. - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - thread_barrier( thread ); - if( thread_am_chief( thread ) ) + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) bli_obj_release_pack( b_pack ); - if( thread_am_caucus_chief( thread ) ){ + if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( c1_pack ); } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.h b/frame/3/gemm/bli_gemm_blk_var1f.h index c7ff240b0..99548ac12 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.h +++ b/frame/3/gemm/bli_gemm_blk_var1f.h @@ -36,5 +36,5 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index c438d4607..e521b769f 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -38,7 +38,7 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; @@ -53,7 +53,7 @@ void bli_gemm_blk_var2f( obj_t* a, dim_t n_trans; - if( thread_am_chief( thread ) ) { + if( thread_am_ochief( thread ) ) { // Initialize object for packing A bli_obj_init_pack( &a_pack_s ); bli_packm_init( a, &a_pack_s, @@ -64,20 +64,20 @@ void bli_gemm_blk_var2f( obj_t* a, c, cntl_sub_scalm( cntl ) ); } - a_pack = thread_broadcast( thread, &a_pack_s ); + a_pack = thread_obroadcast( thread, &a_pack_s ); // Initialize all pack objects that are passed into packm_init(). 
- if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } - b1_pack = thread_caucus_broadcast( thread, &b1_pack_s ); - c1_pack = thread_caucus_broadcast( thread, &c1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntl_sub_packm_a( cntl ), - thread ); + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); @@ -101,26 +101,26 @@ void bli_gemm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); // Packing must be done before computation - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, @@ -129,25 +129,25 @@ void bli_gemm_blk_var2f( obj_t* a, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), - thread_sub_caucus( thread) ); + gemm_thread_sub_gemm( thread) ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ) ); } //Barrier to make sure unpacking is done before next iteration's packing of C //Somehow, we'd like to make this a noop if packing isn't done. 
- thread_caucus_barrier( thread ); + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - thread_barrier( thread ); - if( thread_am_chief( thread ) ) + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) bli_obj_release_pack( a_pack ); - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_obj_release_pack( b1_pack ); bli_obj_release_pack( c1_pack ); } diff --git a/frame/3/gemm/bli_gemm_blk_var2f.h b/frame/3/gemm/bli_gemm_blk_var2f.h index 488966cab..73dea73b8 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.h +++ b/frame/3/gemm/bli_gemm_blk_var2f.h @@ -36,5 +36,5 @@ void bli_gemm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index b1378de59..8af9837a0 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -38,7 +38,7 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; @@ -52,7 +52,7 @@ void bli_gemm_blk_var3f( obj_t* a, dim_t b_alg; dim_t k_trans; - if( thread_am_chief( thread ) ){ + if( thread_am_ochief( thread ) ){ // Initialize object for packing C bli_obj_init_pack( &c_pack_s ); bli_packm_init( c, &c_pack_s, @@ -63,20 +63,20 @@ void bli_gemm_blk_var3f( obj_t* a, c, cntl_sub_scalm( cntl ) ); } - c_pack = thread_broadcast( thread, &c_pack_s ); + c_pack = thread_obroadcast( thread, &c_pack_s ); // Initialize pack objects for A and B that are passed into packm_init(). 
- if( thread_am_caucus_chief( thread ) ){ + if( thread_am_ichief( thread ) ){ bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &b1_pack_s ); } - a1_pack = thread_caucus_broadcast( thread, &a1_pack_s ); - b1_pack = thread_caucus_broadcast( thread, &b1_pack_s ); + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); // Pack C (if instructed). bli_packm_int( c, c_pack, cntl_sub_packm_c( cntl ), - thread ); + gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -100,26 +100,34 @@ void bli_gemm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - if( thread_am_caucus_chief( thread ) ) { + if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); } - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - thread_sub_caucus( thread ) ); + gemm_thread_sub_ipackm( thread ) ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c_pack is a local obj_t, we can simply overwrite the + // internal beta scalar with BLIS_ONE once it has been used in the + // first iteration. + if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); // Packing must be done before computation. - thread_caucus_barrier( thread ); + thread_ibarrier( thread ); // Perform gemm subproblem. 
bli_gemm_int( &BLIS_ONE, @@ -128,15 +136,8 @@ void bli_gemm_blk_var3f( obj_t* a, &BLIS_ONE, c_pack, cntl_sub_gemm( cntl ), - thread_sub_caucus( thread) ); + gemm_thread_sub_gemm( thread) ); - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - if ( i == 0 ) bli_obj_scalar_reset( c_pack ); } // Unpack C (if C was packed). @@ -145,12 +146,12 @@ void bli_gemm_blk_var3f( obj_t* a, // If any packing buffers were acquired within packm, release them back // to the memory manager. - thread_barrier( thread ); - if( thread_am_caucus_chief( thread ) ){ + thread_obarrier( thread ); + if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); } - if( thread_am_chief( thread ) ) + if( thread_am_ochief( thread ) ) bli_obj_release_pack( c_pack ); } diff --git a/frame/3/gemm/bli_gemm_blk_var3f.h b/frame/3/gemm/bli_gemm_blk_var3f.h index 8bbbde559..cdd655c2f 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.h +++ b/frame/3/gemm/bli_gemm_blk_var3f.h @@ -36,5 +36,5 @@ void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 19458e52e..f8d56e684 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,11 +55,11 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; -dim_t gemm_caucuses_at_level[5] = {1, 1, 2, 1, 1}; +dim_t gemm_caucuses_at_level[5] = {2, 1, 1, 1, 1}; -thrinfo_t* bli_gemm_cntl_get_thrinfos() +gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() { - return bli_create_thread_info( gemm_caucuses_at_level, 5 ); + return bli_create_gemm_thrinfo_paths( gemm_caucuses_at_level, 5 ); } 
void bli_gemm_cntl_free_thrinfos(thrinfo_t* tofree) diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 6a94bf892..136f89ef5 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -65,4 +65,4 @@ gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, gemm_t* sub_gemm, unpackm_t* sub_unpack_c ); -thrinfo_t* bli_gemm_cntl_get_thrinfos(); +gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos(); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index c1d48cc27..af93b6079 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -74,7 +74,7 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); + gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); dim_t n_threads = thread_num_threads( (&infos[0]) ); // Invoke the internal back-end. diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 44c4bdca1..5218ab8c0 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -40,7 +40,7 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); static FUNCPTR_T vars[6][3] = { @@ -59,7 +59,7 @@ void bli_gemm_int( obj_t* alpha, obj_t* beta, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index cc2e4a929..bfefe30c7 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -38,5 +38,5 @@ void bli_gemm_int( obj_t* alpha, obj_t* beta, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index ab5585d7c..f70016b24 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -55,7 +55,7 @@ void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, 
- thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.h b/frame/3/gemm/bli_gemm_ker_var2.h index e41ee44be..a1627df40 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.h +++ b/frame/3/gemm/bli_gemm_ker_var2.h @@ -40,7 +40,7 @@ void bli_gemm_ker_var2( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); // diff --git a/frame/3/gemm/bli_gemm_ker_var5.c b/frame/3/gemm/bli_gemm_ker_var5.c index d89f0ccd3..33d245780 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.c +++ b/frame/3/gemm/bli_gemm_ker_var5.c @@ -55,7 +55,7 @@ void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ) + gemm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); diff --git a/frame/3/gemm/bli_gemm_ker_var5.h b/frame/3/gemm/bli_gemm_ker_var5.h index dee007158..52a237bbc 100644 --- a/frame/3/gemm/bli_gemm_ker_var5.h +++ b/frame/3/gemm/bli_gemm_ker_var5.h @@ -40,7 +40,7 @@ void bli_gemm_ker_var5( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, - thrinfo_t* thread ); + gemm_thrinfo_t* thread ); // diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c8af4cefd..a99869dd2 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -87,6 +87,6 @@ void bli_hemm_front( side_t side, beta, &c_local, cntl, - &BLIS_SINGLE_THREADED ); + &BLIS_GEMM_SINGLE_THREADED ); } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index d53f4649f..fcb8afbb7 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -67,7 +67,7 @@ void bli_herk_blk_var1f( obj_t* a, // Pack A' (if instructed). bli_packm_int( ah, &ah_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the m dimension. 
for ( i = 0; i < m_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_herk_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index d872980a6..e09b48810 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -74,7 +74,7 @@ void bli_herk_blk_var2f( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -105,12 +105,12 @@ void bli_herk_blk_var2f( obj_t* a, // Pack A1' (if instructed). bli_packm_int( &ah1, &ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1S, &c1S_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index e3195ddba..cb3b323e2 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -67,7 +67,7 @@ void bli_herk_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_herk_blk_var3f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack B1 (if instructed). 
bli_packm_int( &ah1, &ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index ef1af18a6..5043f1355 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -86,6 +86,6 @@ void bli_symm_front( side_t side, beta, &c_local, cntl, - &BLIS_SINGLE_THREADED ); + &BLIS_GEMM_SINGLE_THREADED ); } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 5fbcb79e5..23238a089 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -78,7 +78,7 @@ void bli_trmm_blk_var1f( obj_t* a, // Pack B (if instructed). bli_packm_int( b, &b_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -102,12 +102,12 @@ void bli_trmm_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index d8669cf71..0c98da8e6 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -67,7 +67,7 @@ void bli_trmm_blk_var2b( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trmm_blk_var2b( obj_t* a, // Pack B1 (if instructed). 
bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 7bcc529e8..14571322b 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -67,7 +67,7 @@ void bli_trmm_blk_var2f( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trmm_blk_var2f( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 1a52a76ca..11b3dc551 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -67,7 +67,7 @@ void bli_trmm_blk_var3b( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trmm_blk_var3b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack B1 (if instructed). 
bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index c0c65166f..59050423c 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -67,7 +67,7 @@ void bli_trmm_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trmm_blk_var3f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 26931e73a..6d4681f35 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -69,7 +69,7 @@ void bli_trsm_blk_var1b( obj_t* a, // Pack B1 (if instructed). bli_packm_int( b, &b_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the remaining portion of the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -93,7 +93,7 @@ void bli_trsm_blk_var1b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index cf3d8399d..8177e183b 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -68,7 +68,7 @@ void bli_trsm_blk_var1f( obj_t* a, // Pack B1 (if instructed). bli_packm_int( b, &b_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the remaining portion of the m dimension. for ( i = offA; i < m_trans; i += b_alg ) @@ -90,7 +90,7 @@ void bli_trsm_blk_var1f( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 0920dc461..724b88f2d 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -67,7 +67,7 @@ void bli_trsm_blk_var2b( obj_t* a, // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trsm_blk_var2b( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 562d8782a..5e57ecee8 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -67,7 +67,7 @@ void bli_trsm_blk_var2f( obj_t* a, // Pack A (if instructed). 
bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trsm_blk_var2f( obj_t* a, // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack C1 (if instructed). bli_packm_int( &c1, &c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 3b9ae3478..f65a75b78 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -67,7 +67,7 @@ void bli_trsm_blk_var3b( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trsm_blk_var3b( obj_t* a, // Pack A1 (if instructed). bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 625909d23..a6bc85ed0 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -67,7 +67,7 @@ void bli_trsm_blk_var3f( obj_t* a, // Pack C (if instructed). bli_packm_int( c, &c_pack, cntl_sub_packm_c( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) @@ -91,12 +91,12 @@ void bli_trsm_blk_var3f( obj_t* a, // Pack A1 (if instructed). 
bli_packm_int( &a1, &a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Pack B1 (if instructed). bli_packm_int( &b1, &b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_SINGLE_THREADED ); + &BLIS_PACKM_SINGLE_THREADED ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 5c7ec7997..a72eb2da3 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -46,7 +46,9 @@ obj_t BLIS_MINUS_ONE_HALF; obj_t BLIS_MINUS_ONE; obj_t BLIS_MINUS_TWO; -thrinfo_t BLIS_SINGLE_THREADED; +packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; +gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; + thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) @@ -61,7 +63,7 @@ void bli_init( void ) bli_mem_init(); - bli_setup_single_threaded_info( &BLIS_SINGLE_THREADED, &BLIS_SINGLE_COMM ); + //bli_setup_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED, &BLIS_SINGLE_COMM ); } void bli_finalize( void ) diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 77d948b77..f56c275d0 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -39,6 +39,7 @@ void bli_cleanup_communicator( thread_comm_t* communicator ) if( communicator == NULL ) return; bli_destroy_lock( &communicator->barrier_lock ); } + void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) { if( communicator == NULL ) return; @@ -111,7 +112,7 @@ void bli_barrier( thread_comm_t* communicator, dim_t t_id ) while( *listener == my_sense ) {} } } - +/* //Recursively create thread communicators void create_comms( dim_t* caucuses_at_level, dim_t n_levels, dim_t cur_level, thread_comm_tree_t* parent, thread_comm_tree_t* leaves, dim_t global_id ) @@ -142,35 +143,29 @@ void create_comms( dim_t* caucuses_at_level, dim_t n_levels, dim_t cur_level, for( dim_t i = 0; i < caucuses; i++) create_comms( caucuses_at_level, n_levels, cur_level+1, info, leaves, global_id * caucuses + i); } - -void 
bli_setup_thrinfo_t(thrinfo_t* thr, thread_comm_t* comm, dim_t comm_id, - thrinfo_t* caucus, dim_t n_caucuses, dim_t caucus_id ) +*/ +thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) { - thr->ocomm = comm; - thr->ocomm_id = comm_id; - thr->caucus = caucus; - thr->n_caucuses = n_caucuses; - thr->caucus_id = caucus_id; -} -thrinfo_t* bli_create_thrinfo_t( thread_comm_t* comm, dim_t comm_id, thrinfo_t* caucus, dim_t n_caucuses, dim_t caucus_id ) -{ thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) ); - thr->ocomm = comm; - thr->ocomm_id = comm_id; - thr->caucus = caucus; - thr->n_caucuses = n_caucuses; - thr->caucus_id = caucus_id; + bli_setup_thread_info( thr, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); return thr; } -void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ) +void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) { - bli_setup_communicator( comm, 1 ); - bli_setup_thrinfo_t( thr, comm, 0, NULL, 1, 0 ); - thr->caucus = thr; + thr->ocomm = ocomm; + thr->ocomm_id = ocomm_id; + thr->icomm = icomm; + thr->icomm_id = icomm_id; + + thr->n_way = n_way; + thr->work_id = work_id; } +/* thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) { //Calculate total number of threads @@ -209,20 +204,21 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) bli_setup_thrinfo_t(cur, comm_node->comm, ocomm_id, prev, caucuses_at_level[n_levels - j - 1], caucus_id ); - cur = prev; + prev = cur; comm_node = comm_node->parent; } } return info_paths; } - -void bli_get_range( thrinfo_t* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ) +*/ +void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ) { - dim_t n_caucuses = thread->n_caucuses; - dim_t caucus_id = 
thread->caucus_id; - dim_t n_pt = size / n_caucuses; - n_pt = (n_pt * n_caucuses < size) ? n_pt + 1 : n_pt; + thrinfo_t* thread = (thrinfo_t*) thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + dim_t n_pt = size / n_way; + n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); - *start = caucus_id * n_pt; + *start = work_id * n_pt; *end = bli_min( *start + n_pt, size ); } diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index 35936b791..9fbfcf21b 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -34,6 +34,7 @@ #ifndef BLIS_THREADING_H #define BLIS_THREADING_H + typedef omp_lock_t lock_t; struct thread_comm_s @@ -47,14 +48,6 @@ struct thread_comm_s }; typedef struct thread_comm_s thread_comm_t; -struct thread_comm_tree_s -{ - struct thread_comm_tree_s* parent; - thread_comm_t* comm; -}; -typedef struct thread_comm_tree_s thread_comm_tree_t; - - void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads ); thread_comm_t* bli_create_communicator( dim_t n_threads ); @@ -66,42 +59,43 @@ void bli_unset_lock( lock_t* lock ); void bli_init_lock( lock_t* lock ); void bli_destroy_lock( lock_t* lock ); -/* - * Each thrinfo_t is a linked list. - * It represents a path through a thread communicator hierarchy. 
- * There is a 1:1 correspondence between leaf nodes and thrinfo_t - * - * When we hit a loop, we advance the linked list towards the bottom of the hierarchy - */ struct thrinfo_s { thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm - struct thrinfo_s* caucus; //my thread info for the caucus I am a part of - dim_t n_caucuses; //Number of distinct caucuses used to parallelize the loop - dim_t caucus_id; //Which caucus we are part of + dim_t n_way; //Number of distinct used to parallelize the loop + dim_t work_id; //What we're working on }; typedef struct thrinfo_s thrinfo_t; -#define thread_comm( thread ) thread->ocomm -#define thread_caucus_comm( thread ) (thread->caucus->ocomm) +#define thread_ocomm( thread ) thread->ocomm +#define thread_icomm( thread ) (thread->icomm) #define thread_id( thread ) thread->ocomm_id #define thread_num_threads( thread ) thread->ocomm->n_threads -#define thread_sub_caucus( thread ) thread->caucus -#define thread_caucus_id( thread ) thread->caucus_id -#define thread_num_caucuses( thread ) thread->n_caucuses -#define thread_am_chief( thread ) (thread->ocomm_id == 0) -#define thread_am_caucus_chief( thread ) (thread->caucus->ocomm_id == 0) -#define thread_broadcast( thread, ptr ) bli_broadcast_structure( thread->ocomm, thread->ocomm_id, ptr ) -#define thread_caucus_broadcast( thread, ptr ) bli_broadcast_structure( thread->caucus->ocomm, thread->caucus->ocomm_id, ptr ) -#define thread_barrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) -#define thread_caucus_barrier( thread ) bli_barrier( thread->caucus->ocomm, thread->caucus->ocomm_id ) +#define thread_work_id( thread ) thread->work_id +#define thread_n_way( thread ) thread->n_way +#define 
thread_am_ochief( thread ) (thread->ocomm_id == 0) +#define thread_am_ichief( thread ) (thread->icomm_id == 0) -thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); -void bli_get_range( thrinfo_t* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ); -void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); +#define thread_obroadcast( thread, ptr ) bli_broadcast_structure( thread->ocomm, thread->ocomm_id, ptr ) +#define thread_ibroadcast( thread, ptr ) bli_broadcast_structure( thread->icomm, thread->icomm_id, ptr ) +#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) +#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) + +void bli_get_range( void* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ); +thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +//void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); +//thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); + +#include "bli_packm_threading.h" +#include "bli_gemm_threading.h" #endif diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 8f8440433..e916a17bd 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -43,6 +43,7 @@ extern obj_t BLIS_MINUS_ONE_HALF; extern obj_t BLIS_MINUS_ONE; extern obj_t BLIS_MINUS_TWO; -extern thrinfo_t BLIS_SINGLE_THREADED; +extern packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; +extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif From 8d8f4352a41926bc923e47be836365b6b726aff2 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 10 Mar 2014 15:47:28 -0500 Subject: [PATCH 06/42] Added single 
threaded thread info data structures specifically for gemm and packm --- frame/1m/packm/bli_packm_blk_var1.c | 2 +- frame/1m/packm/bli_packm_blk_var3.c | 2 +- frame/1m/packm/bli_packm_blk_var4.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 14 +++++++------- frame/3/gemm/bli_gemm_ker_var2.h | 2 +- frame/base/bli_init.c | 7 ++++--- frame/include/bli_extern_defs.h | 5 +++-- 7 files changed, 18 insertions(+), 16 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 5a6792c33..91cb8b335 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -274,7 +274,7 @@ void PASTEMAC(ch,varname )( \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*ps_p; \ + p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index 302f97bc8..eb828543b 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -98,7 +98,7 @@ void bli_packm_blk_var3( obj_t* c, // in the real domain. if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( c, p, &BLIS_PACKM_SINGLE_THREADED ); return; } diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index 2e83d9b1e..dd1cedfc8 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -98,7 +98,7 @@ void bli_packm_blk_var4( obj_t* c, // in the real domain. 
if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( c, p, &BLIS_PACKM_SINGLE_THREADED ); return; } diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 17030550b..7a71e8f31 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -46,7 +46,7 @@ typedef void (*FUNCPTR_T)( void* beta, void* c, inc_t rs_c, inc_t cs_c, void* gemm_ukr, - thrinfo_t* thread + gemm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); @@ -137,7 +137,7 @@ void PASTEMAC(ch,varname)( \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemm_ukr, \ - thrinfo_t* thread \ + gemm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -217,11 +217,11 @@ void PASTEMAC(ch,varname)( \ bli_auxinfo_set_ps_a( ps_a, aux ); \ bli_auxinfo_set_ps_b( ps_b, aux ); \ \ - thrinfo_t* caucus = thread_sub_caucus( thread ); \ - dim_t l2_num_threads = thread_num_caucuses( thread ); \ - dim_t l2_thread_id = thread_caucus_id( thread ); \ - dim_t l1_num_threads = thread_num_caucuses( caucus ); \ - dim_t l1_thread_id = thread_caucus_id( caucus ); \ + gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \ + dim_t l2_num_threads = thread_n_way( thread ); \ + dim_t l2_thread_id = thread_work_id( thread ); \ + dim_t l1_num_threads = thread_n_way( caucus ); \ + dim_t l1_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = l2_thread_id; j < n_iter; j += l2_num_threads ) \ diff --git a/frame/3/gemm/bli_gemm_ker_var2.h b/frame/3/gemm/bli_gemm_ker_var2.h index 1fa80e528..ca5ac1eff 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.h +++ b/frame/3/gemm/bli_gemm_ker_var2.h @@ -59,7 +59,7 @@ void PASTEMAC(ch,varname)( \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemm_ukr, \ - thrinfo_t* thread \ + gemm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( gemm_ker_var2 ) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index a72eb2da3..116ccd971 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -48,7 +48,6 @@ obj_t BLIS_MINUS_TWO; packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; - thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) @@ -62,8 +61,10 @@ void bli_init( void ) bli_error_msgs_init(); bli_mem_init(); - - //bli_setup_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED, &BLIS_SINGLE_COMM ); + + bli_setup_communicator( &BLIS_SINGLE_COMM, 1 ); + bli_setup_packm_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED ); + bli_setup_gemm_single_threaded_info( &BLIS_GEMM_SINGLE_THREADED ); } void bli_finalize( void ) diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index e916a17bd..ad58e7192 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -43,7 +43,8 @@ extern obj_t BLIS_MINUS_ONE_HALF; extern obj_t BLIS_MINUS_ONE; extern obj_t BLIS_MINUS_TWO; -extern packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; -extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +extern thread_comm_t BLIS_SINGLE_COMM; +extern packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; +extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif From 020f80c30289d8bcaa688bf600b01fae9b23b54f Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 11 Mar 2014 12:08:17 -0500 Subject: [PATCH 07/42] Added files specific to threading for gemm and packm operations --- frame/1m/packm/bli_packm_threading.c | 57 
+++++++++ frame/1m/packm/bli_packm_threading.h | 51 ++++++++ frame/3/gemm/bli_gemm_threading.c | 170 +++++++++++++++++++++++++++ frame/3/gemm/bli_gemm_threading.h | 73 ++++++++++++ 4 files changed, 351 insertions(+) create mode 100644 frame/1m/packm/bli_packm_threading.c create mode 100644 frame/1m/packm/bli_packm_threading.h create mode 100644 frame/3/gemm/bli_gemm_threading.c create mode 100644 frame/3/gemm/bli_gemm_threading.h diff --git a/frame/1m/packm/bli_packm_threading.c b/frame/1m/packm/bli_packm_threading.c new file mode 100644 index 000000000..0fa6b0bf2 --- /dev/null +++ b/frame/1m/packm/bli_packm_threading.c @@ -0,0 +1,57 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + return (packm_thrinfo_t*) bli_create_thread_info( ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); +} + +void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ) +{ + bli_setup_thread_info( (thrinfo_t*) thread, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); +} + +void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; +} diff --git a/frame/1m/packm/bli_packm_threading.h b/frame/1m/packm/bli_packm_threading.h new file mode 100644 index 000000000..12be0c9cd --- /dev/null +++ b/frame/1m/packm/bli_packm_threading.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +struct packm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on +}; +typedef struct packm_thrinfo_s packm_thrinfo_t; + +packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread ); diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c new file mode 100644 index 000000000..53511fcc0 --- /dev/null +++ b/frame/3/gemm/bli_gemm_threading.c @@ -0,0 +1,170 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + gemm_thrinfo_t* sub_gemm ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_gemm = sub_gemm; +} + +void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_gemm = thread; +} + +gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + gemm_thrinfo_t* sub_gemm ) +{ + gemm_thrinfo_t* thread = ( gemm_thrinfo_t* ) 
bli_malloc( sizeof( gemm_thrinfo_t ) ); + bli_setup_gemm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_gemm ); + return thread; +} + +gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_levels ) +{ + + assert(n_levels == 5); + + dim_t jc_way = threads_at_level[0]; + dim_t kc_way = threads_at_level[1]; + dim_t ic_way = threads_at_level[2]; + dim_t jr_way = threads_at_level[3]; + dim_t ir_way = threads_at_level[4]; + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + gemm_thrinfo_t* paths = (gemm_thrinfo_t*) malloc( global_num_threads * sizeof( gemm_thrinfo_t ) ); + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_nt; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_nt; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_nt; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_nt; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < jc_nt; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + gemm_thrinfo_t* ir_info = bli_create_gemm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + gemm_thrinfo_t* jr_info = bli_create_gemm_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = 
bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + gemm_thrinfo_t* ic_info = bli_create_gemm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + gemm_thrinfo_t* kc_info = bli_create_gemm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + gemm_thrinfo_t* jc_info = &paths[global_comm_id]; + bli_setup_gemm_thrinfo_node( jc_info, global_comm, global_comm_id, + jr_comm, jr_comm_id, + jr_way, a, + NULL, NULL, kc_info); + } + } + } + } + } + return paths; +} diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h new file mode 100644 index 000000000..784a4b9ef --- /dev/null +++ b/frame/3/gemm/bli_gemm_threading.h @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct gemm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct gemm_thrinfo_s* sub_gemm; +}; +typedef struct gemm_thrinfo_s gemm_thrinfo_t; + +#define gemm_thread_sub_gemm( thread ) thread->sub_gemm +#define gemm_thread_sub_opackm( thread ) thread->opackm +#define gemm_thread_sub_ipackm( thread ) thread->ipackm + +gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_levels ); + +void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + gemm_thrinfo_t* sub_gemm ); + +gemm_thrinfo_t* bli_create_gemm_thrinfo_node( 
thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + gemm_thrinfo_t* sub_gemm ); + +void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread ); From 92233cf64274b27b2217c5cfffe75443ff6137a4 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 11 Mar 2014 14:16:08 -0500 Subject: [PATCH 08/42] Some fixes to gemm thread info tree creation, Changed microkernel tests to use the new BLIS_PACKM_SINGLE_THREADED instead of BLIS_SINGLE_THREADED --- frame/1m/packm/bli_packm_blk_var1.c | 2 +- frame/3/gemm/bli_gemm_cntl.c | 2 +- frame/3/gemm/bli_gemm_threading.c | 15 ++++++++------- frame/base/bli_threading.c | 1 + testsuite/src/test_gemm_ukr.c | 4 ++-- testsuite/src/test_gemmtrsm_ukr.c | 6 +++--- testsuite/src/test_trsm_ukr.c | 4 ++-- 7 files changed, 18 insertions(+), 16 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 91cb8b335..d8c84425b 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -265,7 +265,7 @@ void PASTEMAC(ch,varname )( \ p_begin = p_cast; \ dim_t t_id = thread_id( thread ); \ dim_t num_threads = thread_num_threads( thread ); \ -\ + p_inc = ps_p; \ \ for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 5c109f2dc..d10c2daf6 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,7 +55,7 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; -dim_t gemm_caucuses_at_level[5] = {2, 1, 1, 1, 1}; +dim_t gemm_caucuses_at_level[5] = {1, 1, 2, 1, 1}; gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() { diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 53511fcc0..5c0a337ff 100644 --- 
a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -86,7 +86,6 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_levels ) { - assert(n_levels == 5); dim_t jc_way = threads_at_level[0]; @@ -96,31 +95,33 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_ dim_t ir_way = threads_at_level[4]; dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; dim_t kc_nt = ic_way * jr_way * ir_way; dim_t ic_nt = jr_way * ir_way; dim_t jr_nt = ir_way; dim_t ir_nt = 1; + gemm_thrinfo_t* paths = (gemm_thrinfo_t*) malloc( global_num_threads * sizeof( gemm_thrinfo_t ) ); thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); - for( int a = 0; a < jc_nt; a++ ) + for( int a = 0; a < jc_way; a++ ) { thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); - for( int b = 0; b < kc_nt; b++ ) + for( int b = 0; b < kc_way; b++ ) { thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); - for( int c = 0; c < ic_nt; c++ ) + for( int c = 0; c < ic_way; c++ ) { thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); - for( int d = 0; d < jr_nt; d++ ) + for( int d = 0; d < jr_way; d++ ) { thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); - for( int e = 0; e < jc_nt; e++) + for( int e = 0; e < ir_way; e++) { thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); - dim_t ir_comm_id = 0; dim_t jr_comm_id = e*ir_nt + ir_comm_id; dim_t ic_comm_id = d*jr_nt + jr_comm_id; diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 9d70a1900..f830ebc2d 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -214,6 +214,7 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) void bli_get_range( void* thr, dim_t size, dim_t 
block_factor, dim_t* start, dim_t* end ) { thrinfo_t* thread = (thrinfo_t*) thr; + dim_t n_way = thread->n_way; dim_t work_id = thread->work_id; dim_t n_pt = size / n_way; diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index fc73eea86..7dbae77e2 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -221,8 +221,8 @@ void libblis_test_gemm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index f40d54eb5..f1e05572f 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -251,10 +251,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); // Create subpartitions from the a and b panels. @@ -268,7 +268,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params, bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. 
- bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 2262b0a0d..86764c7d7 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -217,14 +217,14 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params, &b, &bp ); // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &a, &ap, &BLIS_PACKM_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &BLIS_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c ); From c720b141568d1f289146bf34ded08001f2c0dfbb Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 17 Mar 2014 11:39:32 -0500 Subject: [PATCH 09/42] Switched to using environment variables to control threading. The environment variables all follow the format BLIS_X_NT, where X is the index of the loop as described in our paper Anatomy of High Performance Many-Threaded Matrix Multiplication. These indices are IR, JR, IC, KC, and JC. Also enabled parallelism for hemm and symm, but these are currently untested. 
--- frame/3/gemm/bli_gemm_cntl.c | 4 +--- frame/3/gemm/bli_gemm_front.c | 2 ++ frame/3/gemm/bli_gemm_threading.c | 23 ++++++++++++++++------- frame/3/gemm/bli_gemm_threading.h | 2 +- frame/3/hemm/bli_hemm_front.c | 27 +++++++++++++++++++-------- frame/3/symm/bli_symm_front.c | 25 ++++++++++++++++++------- 6 files changed, 57 insertions(+), 26 deletions(-) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index d10c2daf6..753182a8f 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,11 +55,9 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; -dim_t gemm_caucuses_at_level[5] = {1, 1, 2, 1, 1}; - gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() { - return bli_create_gemm_thrinfo_paths( gemm_caucuses_at_level, 5 ); + return bli_create_gemm_thrinfo_paths( ); } void bli_gemm_cntl_free_thrinfos(thrinfo_t* tofree) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index af93b6079..1c26681af 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -90,5 +90,7 @@ void bli_gemm_front( obj_t* alpha, cntl, &infos[omp_id] ); } + + bli_gemm_cntl_free_thrinfos( infos ); } diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 5c0a337ff..15c3aa84b 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -84,15 +84,24 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_levels ) +dim_t read_env( char* env ) { - assert(n_levels == 5); + dim_t number = 1; + char* str = getenv( env ); + if( str != NULL ) + { + number = strtol( str, NULL, 10 ); + } + return number; +} - dim_t jc_way = threads_at_level[0]; - dim_t kc_way = threads_at_level[1]; - dim_t ic_way = threads_at_level[2]; - dim_t jr_way = threads_at_level[3]; - dim_t ir_way = threads_at_level[4]; +gemm_thrinfo_t* 
bli_create_gemm_thrinfo_paths( ) +{ + dim_t jc_way = read_env( "BLIS_JC_NT" ); + dim_t kc_way = read_env( "BLIS_KC_NT" ); + dim_t ic_way = read_env( "BLIS_IC_NT" ); + dim_t jr_way = read_env( "BLIS_JR_NT" ); + dim_t ir_way = read_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index 784a4b9ef..d046608da 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -53,7 +53,7 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_opackm( thread ) thread->opackm #define gemm_thread_sub_ipackm( thread ) thread->ipackm -gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( dim_t* threads_at_level, dim_t n_levels ); +gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ); void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index a99869dd2..4613857b8 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -80,13 +80,24 @@ void bli_hemm_front( side_t side, bli_obj_swap( a_local, b_local ); } - // Invoke the internal back-end. - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl, - &BLIS_GEMM_SINGLE_THREADED ); + gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); + dim_t n_threads = thread_num_threads( (&infos[0]) ); + + // Invoke the internal back-end. + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + // Invoke the internal back-end. 
+ bli_gemm_int( alpha, + &a_local, + &b_local, + beta, + &c_local, + cntl, + &infos[omp_id] ); + } + + bli_gemm_cntl_free_thrinfos( infos ); } diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 5043f1355..abc7930a3 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -79,13 +79,24 @@ void bli_symm_front( side_t side, bli_obj_swap( a_local, b_local ); } + gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); + dim_t n_threads = thread_num_threads( (&infos[0]) ); + // Invoke the internal back-end. - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl, - &BLIS_GEMM_SINGLE_THREADED ); + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + + bli_gemm_int( alpha, + &a_local, + &b_local, + beta, + &c_local, + cntl, + &infos[omp_id] ); + } + + bli_gemm_cntl_free_thrinfos( infos ); } From c51d0110831eb89361b4720bf7ed75edbd26ebce Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 17 Mar 2014 15:00:47 -0500 Subject: [PATCH 10/42] Initial multithreading support for HERK --- frame/3/gemm/bli_gemm_blk_var2f.c | 4 +- frame/3/gemm/bli_gemm_blk_var3f.c | 18 +-- frame/3/gemm/bli_gemm_cntl.c | 5 - frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 12 +- frame/3/gemm/bli_gemm_threading.c | 4 + frame/3/gemm/bli_gemm_threading.h | 1 + frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/her2k/bli_her2k_front.c | 6 +- frame/3/herk/bli_herk_blk_var1f.c | 112 +++++++++++------- frame/3/herk/bli_herk_blk_var1f.h | 3 +- frame/3/herk/bli_herk_blk_var2f.c | 111 ++++++++++------- frame/3/herk/bli_herk_blk_var2f.h | 3 +- frame/3/herk/bli_herk_blk_var3f.c | 111 ++++++++++------- frame/3/herk/bli_herk_blk_var3f.h | 3 +- frame/3/herk/bli_herk_front.c | 24 +++- frame/3/herk/bli_herk_int.c | 9 +- frame/3/herk/bli_herk_int.h | 3 +- frame/3/herk/bli_herk_l_ker_var2.c | 35 +++--- frame/3/herk/bli_herk_l_ker_var2.h | 6 +- frame/3/herk/bli_herk_threading.c | 184 
+++++++++++++++++++++++++++++ frame/3/herk/bli_herk_threading.h | 74 ++++++++++++ frame/3/herk/bli_herk_u_ker_var2.c | 35 +++--- frame/3/herk/bli_herk_u_ker_var2.h | 6 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/syr2k/bli_syr2k_front.c | 6 +- frame/3/syrk/bli_syrk_front.c | 3 +- frame/base/bli_init.c | 2 + frame/base/bli_threading.h | 1 + frame/include/bli_extern_defs.h | 1 + 30 files changed, 587 insertions(+), 201 deletions(-) create mode 100644 frame/3/herk/bli_herk_threading.c create mode 100644 frame/3/herk/bli_herk_threading.h diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index e521b769f..66f2ce70b 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -66,7 +66,7 @@ void bli_gemm_blk_var2f( obj_t* a, } a_pack = thread_obroadcast( thread, &a_pack_s ); - // Initialize all pack objects that are passed into packm_init(). + // Initialize pack objects for B and C that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); @@ -129,7 +129,7 @@ void bli_gemm_blk_var2f( obj_t* a, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), - gemm_thread_sub_gemm( thread) ); + gemm_thread_sub_gemm( thread ) ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 8af9837a0..f0647ccb3 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -80,17 +80,15 @@ void bli_gemm_blk_var3f( obj_t* a, // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - dim_t start, end; - bli_get_range( thread, k_trans, 1, &start, &end ); // Partition along the k dimension. - for ( i = start; i < end; i += b_alg ) + for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! 
// This causes the right blocksize to be used if c and a are // complex and b is real. - b_alg = bli_determine_blocksize_f( i, end, b, + b_alg = bli_determine_blocksize_f( i, k_trans, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. @@ -140,18 +138,20 @@ void bli_gemm_blk_var3f( obj_t* a, } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - thread_obarrier( thread ); if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); } - if( thread_am_ochief( thread ) ) - bli_obj_release_pack( c_pack ); } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 753182a8f..fd6f92c14 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -60,11 +60,6 @@ gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() return bli_create_gemm_thrinfo_paths( ); } -void bli_gemm_cntl_free_thrinfos(thrinfo_t* tofree) -{ - //MEMORYLEAK -} - void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. 
diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 1c26681af..88bc32d9a 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -91,6 +91,6 @@ void bli_gemm_front( obj_t* alpha, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 7a71e8f31..7d0734e40 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -218,13 +218,13 @@ void PASTEMAC(ch,varname)( \ bli_auxinfo_set_ps_b( ps_b, aux ); \ \ gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \ - dim_t l2_num_threads = thread_n_way( thread ); \ - dim_t l2_thread_id = thread_work_id( thread ); \ - dim_t l1_num_threads = thread_n_way( caucus ); \ - dim_t l1_thread_id = thread_work_id( caucus ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = l2_thread_id; j < n_iter; j += l2_num_threads ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -239,7 +239,7 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = l1_thread_id; i < m_iter; i += l1_num_threads ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ \ diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 15c3aa84b..b0d28c8c5 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -95,6 +95,10 @@ dim_t read_env( char* env ) return number; } +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* threads ) +{ +} + gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) { dim_t jc_way = read_env( "BLIS_JC_NT" ); diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index d046608da..280ba96ad 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -54,6 +54,7 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_ipackm( thread ) thread->ipackm gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ); +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* ); void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 4613857b8..fde8f9f70 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -98,6 +98,6 @@ void bli_hemm_front( side_t side, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index bfcd076cf..1097c338c 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -115,14 +115,16 @@ void bli_her2k_front( obj_t* alpha, &bh_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); bli_herk_int( &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); #endif } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index fcb8afbb7..88671b99f 100644 
--- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -37,43 +37,58 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah_pack; - obj_t c1, c1_pack; + obj_t ah_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack; + obj_t* c1_pack; + obj_t* ah_pack; dim_t i; dim_t b_alg; dim_t m_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A'. + bli_obj_init_pack( &ah_pack_s ); + bli_packm_init( ah, &ah_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + ah_pack = thread_obroadcast( thread, &ah_pack_s ); + + // Initialize pack objects that are passed into packm_init() for A and C. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A' (if instructed). + bli_packm_int( ah, ah_pack, + cntl_sub_packm_b( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A'. - bli_packm_init( ah, &ah_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack A' (if instructed). - bli_packm_int( ah, &ah_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, m_trans, 8, &start, &end ); // Partition along the m dimension. 
- for ( i = 0; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -83,38 +98,53 @@ void bli_herk_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah_pack, + a1_pack, + ah_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_herk( cntl ) ); + c1_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( ah_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var1f.h b/frame/3/herk/bli_herk_blk_var1f.h index dfcae5c99..2a1b85f6e 100644 --- a/frame/3/herk/bli_herk_blk_var1f.h +++ b/frame/3/herk/bli_herk_blk_var1f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index e09b48810..a92888288 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -37,50 +37,66 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a_pack, aS_pack; - obj_t ah1, ah1_pack; - obj_t c1; - obj_t c1S, c1S_pack; + obj_t a_pack_s; + obj_t ah1_pack_s, c1S_pack_s; + + obj_t ah1, c1, c1S; + obj_t aS_pack; + obj_t* a_pack; + obj_t* ah1_pack; + obj_t* c1S_pack; dim_t i; dim_t b_alg; dim_t n_trans; subpart_t stored_part; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c1S_pack ); - // The upper and lower variants are identical, except for which // merged subpartition is acquired in the loop body. if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B; else stored_part = BLIS_SUBPART1T; - // Query dimension in partitioning direction. - n_trans = bli_obj_width_after_trans( *c ); + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); - // Scale C by beta (if instructed). 
- bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); + // Initialize pack objects for C and A' that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &ah1_pack_s ); + bli_obj_init_pack( &c1S_pack_s ); + } + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + c1S_pack = thread_ibroadcast( thread, &c1S_pack_s ); // Pack A (if instructed). bli_packm_int( a, &a_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + hemm_thread_sub_opackm( thread ) ); + + // Query dimension in partitioning direction. + n_trans = bli_obj_width_after_trans( *c ); + dim_t start, end; + + // Needs to be replaced with a weighted range because triangle + bli_get_range( thread, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1' and C1. @@ -91,44 +107,55 @@ void bli_herk_blk_var2f( obj_t* a, // Partition off the stored region of C1 and the corresponding region // of A_pack. - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &c1, &c1S ); - bli_acquire_mpart_t2b( stored_part, - i, b_alg, &a_pack, &aS_pack ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, &c1, &c1S ); + bli_acquire_mpart_t2b( stored_part, + i, b_alg, a_pack, &aS_pack ); // Initialize objects for packing A1' and C1. 
- bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1S, &c1S_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1S, c1S_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ) ; // Pack A1' (if instructed). - bli_packm_int( &ah1, &ah1_pack, + bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1S, &c1S_pack, + bli_packm_int( &c1S, c1S_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ) ; // Perform herk subproblem. bli_herk_int( &BLIS_ONE, &aS_pack, - &ah1_pack, + ah1_pack, &BLIS_ONE, - &c1S_pack, - cntl_sub_herk( cntl ) ); + c1S_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1S_pack, &c1S, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1S_pack, &c1S, + cntl_sub_unpackm_c( cntl ) ); + } + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c1S_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( ah1_pack ); + bli_obj_release_pack( c1S_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var2f.h b/frame/3/herk/bli_herk_blk_var2f.h index 4932535d1..1d405f214 100644 --- a/frame/3/herk/bli_herk_blk_var2f.h +++ b/frame/3/herk/bli_herk_blk_var2f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index cb3b323e2..61ca8c7de 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -37,38 +37,50 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t ah1, ah1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, ah1_pack_s; + + obj_t a1, ah1; + obj_t* a1_pack = NULL; + obj_t* ah1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing C. + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &ah1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &ah1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); + + // Pack C (if instructed). 
+ bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,28 +95,22 @@ void bli_herk_blk_var3f( obj_t* a, i, b_alg, ah, &ah1 ); // Initialize objects for packing A1 and A1'. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, &ah1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &ah1, ah1_pack, + cntl_sub_packm_b( cntl ) ); + } // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + herk_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &ah1, &ah1_pack, + bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - &a1_pack, - &ah1_pack, - &BLIS_ONE, - &c_pack, - cntl_sub_herk( cntl ) ); + herk_thread_sub_ipackm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it @@ -112,17 +118,36 @@ void bli_herk_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. 
- if ( i == 0 ) bli_obj_scalar_reset( &c_pack ); + if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + + // Packing must be done before computation + thread_ibarrier( thread ); + + // Perform herk subproblem. + bli_herk_int( &BLIS_ONE, + a1_pack, + ah1_pack, + &BLIS_ONE, + c_pack, + cntl_sub_herk( cntl ), + herk_thread_sub_herk( thread ) ); + } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &ah1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( ah1_pack ); + } } diff --git a/frame/3/herk/bli_herk_blk_var3f.h b/frame/3/herk/bli_herk_blk_var3f.h index b77ebc33f..22093d421 100644 --- a/frame/3/herk/bli_herk_blk_var3f.h +++ b/frame/3/herk/bli_herk_blk_var3f.h @@ -35,5 +35,6 @@ void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 456cf84a8..ff6a18252 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,12 +77,24 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } + herk_thrinfo_t* infos = bli_herk_cntl_get_thrinfos(); + dim_t n_threads = thread_num_threads( (&infos[0]) ); + // Invoke the internal back-end. 
- bli_herk_int( alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntl ); + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + + bli_herk_int( alpha, + &a_local, + &ah_local, + beta, + &c_local, + cntl, + &infos[omp_id] ); + } + + bli_herk_thrinfo_free_paths( infos ); } diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index bc6a1fa5f..64fd7b1c4 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* ah, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); static FUNCPTR_T vars[2][4][3] = { @@ -66,7 +67,8 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { obj_t a_local; obj_t ah_local; @@ -138,6 +140,7 @@ void bli_herk_int( obj_t* alpha, f( &a_local, &ah_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/bli_herk_int.h index 1b1973b3e..a3fa6343d 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/bli_herk_int.h @@ -37,5 +37,6 @@ void bli_herk_int( obj_t* alpha, obj_t* ah, obj_t* beta, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 8afcf5124..c4d46718b 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void 
bli_herk_l_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). 
*/ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_l_ker_var2.h b/frame/3/herk/bli_herk_l_ker_var2.h index 5dd906db9..09f1c7b31 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.h +++ b/frame/3/herk/bli_herk_l_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c new file mode 100644 index 000000000..ca652f196 --- /dev/null +++ b/frame/3/herk/bli_herk_threading.c @@ -0,0 +1,184 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ) +{ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_herk = sub_herk; +} + +void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread ) +{ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_herk = thread; +} + +herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ) +{ + herk_thrinfo_t* thread = ( herk_thrinfo_t* ) 
bli_malloc( sizeof( herk_thrinfo_t ) ); + bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_herk ); + return thread; +} + +dim_t read_env( char* env ) +{ + dim_t number = 1; + char* str = getenv( env ); + if( str != NULL ) + { + number = strtol( str, NULL, 10 ); + } + return number; +} + +void bli_herk_thrinfo_free_paths( herk_thrinfo_t* threads ) +{ +} + +herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) +{ + dim_t jc_way = read_env( "BLIS_JC_NT" ); + dim_t kc_way = read_env( "BLIS_KC_NT" ); + dim_t ic_way = read_env( "BLIS_IC_NT" ); + dim_t jr_way = read_env( "BLIS_JR_NT" ); + dim_t ir_way = read_env( "BLIS_IR_NT" ); + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + herk_thrinfo_t* paths = (herk_thrinfo_t*) malloc( global_num_threads * sizeof( herk_thrinfo_t ) ); + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, 
ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); + + herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + herk_thrinfo_t* jc_info = &paths[global_comm_id]; + bli_setup_herk_thrinfo_node( jc_info, global_comm, global_comm_id, + jr_comm, jr_comm_id, + jr_way, a, + NULL, NULL, kc_info); + } + } + } + } + } + return paths; +} diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h new file mode 100644 index 000000000..f0e206cc7 --- /dev/null +++ b/frame/3/herk/bli_herk_threading.h @@ -0,0 +1,74 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct herk_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct herk_thrinfo_s* sub_herk; +}; +typedef struct herk_thrinfo_s herk_thrinfo_t; + +#define herk_thread_sub_herk( thread ) thread->sub_herk +#define herk_thread_sub_opackm( thread ) thread->opackm +#define herk_thread_sub_ipackm( thread ) thread->ipackm + +herk_thrinfo_t* bli_herk_create_thrinfo_paths( ); +void bli_herk_thrinfo_free_paths(); + +void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +herk_thrinfo_t* bli_create_herk_thrinfo_node( 
thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + herk_thrinfo_t* sub_herk ); + +void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread ); diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 9c3d6cf06..573738c0f 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + herk_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ) + herk_t* cntl, + herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -121,7 +123,8 @@ void bli_herk_u_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \ + dim_t jr_num_threads = thread_n_way( thread ); \ + dim_t jr_thread_id = thread_work_id( thread ); \ + dim_t ir_num_threads = thread_n_way( caucus ); \ + dim_t ir_thread_id = thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = 0; j < n_iter; ++j ) \ + for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ - a1 = a_cast; \ - c11 = c1; \ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ @@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ } \ -\ - b1 += cstep_b; \ - c1 += cstep_c; \ } \ } diff --git a/frame/3/herk/bli_herk_u_ker_var2.h b/frame/3/herk/bli_herk_u_ker_var2.h index c6555bc27..481947b8e 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.h +++ b/frame/3/herk/bli_herk_u_ker_var2.h @@ -39,7 +39,8 @@ void bli_herk_u_ker_var2( obj_t* a, obj_t* b, obj_t* c, - herk_t* cntl ); + herk_t* cntl, + herk_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + herk_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index abc7930a3..99c628c88 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -97,6 +97,6 @@ void bli_symm_front( side_t side, &infos[omp_id] ); } - bli_gemm_cntl_free_thrinfos( infos ); + bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 4fa89654b..ab2d0d700 100644 --- 
a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -98,14 +98,16 @@ void bli_syr2k_front( obj_t* alpha, &bt_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); bli_herk_int( alpha, &b_local, &at_local, &BLIS_ONE, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); #endif } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index cc2f8d15a..9022c9442 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -79,6 +79,7 @@ void bli_syrk_front( obj_t* alpha, &at_local, beta, &c_local, - cntl ); + cntl, + &BLIS_HERK_SINGLE_THREADED ); } diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 116ccd971..80eadd8e2 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -48,6 +48,7 @@ obj_t BLIS_MINUS_TWO; packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; thread_comm_t BLIS_SINGLE_COMM; void bli_init( void ) @@ -65,6 +66,7 @@ void bli_init( void ) bli_setup_communicator( &BLIS_SINGLE_COMM, 1 ); bli_setup_packm_single_threaded_info( &BLIS_PACKM_SINGLE_THREADED ); bli_setup_gemm_single_threaded_info( &BLIS_GEMM_SINGLE_THREADED ); + bli_setup_herk_single_threaded_info( &BLIS_HERK_SINGLE_THREADED ); } void bli_finalize( void ) diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index 9fbfcf21b..b944457b5 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -97,5 +97,6 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_packm_threading.h" #include "bli_gemm_threading.h" +#include "bli_herk_threading.h" #endif diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index ad58e7192..7b2a2dfd4 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -46,5 +46,6 @@ extern obj_t BLIS_MINUS_TWO; extern thread_comm_t BLIS_SINGLE_COMM; extern 
packm_thrinfo_t BLIS_PACKM_SINGLE_THREADED; extern gemm_thrinfo_t BLIS_GEMM_SINGLE_THREADED; +extern herk_thrinfo_t BLIS_HERK_SINGLE_THREADED; #endif From 5296f58975f7d351f88909cc80b6d0cffd73def7 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 17 Mar 2014 17:15:35 -0500 Subject: [PATCH 11/42] Fixing some bugs with herk parallelization --- frame/3/gemm/bli_gemm_threading.c | 4 ++-- frame/3/herk/bli_herk_blk_var2f.c | 4 ++-- frame/3/herk/bli_herk_blk_var3f.c | 1 + frame/3/herk/bli_herk_front.c | 2 +- frame/3/herk/bli_herk_threading.c | 15 ++------------- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index b0d28c8c5..627df7f9a 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -172,8 +172,8 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) gemm_thrinfo_t* jc_info = &paths[global_comm_id]; bli_setup_gemm_thrinfo_node( jc_info, global_comm, global_comm_id, - jr_comm, jr_comm_id, - jr_way, a, + jc_comm, jc_comm_id, + jc_way, a, NULL, NULL, kc_info); } } diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index a92888288..a1fba63f4 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -81,9 +81,9 @@ void bli_herk_blk_var2f( obj_t* a, c1S_pack = thread_ibroadcast( thread, &c1S_pack_s ); // Pack A (if instructed). - bli_packm_int( a, &a_pack, + bli_packm_int( a, a_pack, cntl_sub_packm_a( cntl ), - hemm_thread_sub_opackm( thread ) ); + herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. 
n_trans = bli_obj_width_after_trans( *c ); diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 61ca8c7de..78e3cd30e 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -101,6 +101,7 @@ void bli_herk_blk_var3f( obj_t* a, bli_packm_init( &ah1, ah1_pack, cntl_sub_packm_b( cntl ) ); } + thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index ff6a18252..19a033a57 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,7 +77,7 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - herk_thrinfo_t* infos = bli_herk_cntl_get_thrinfos(); + herk_thrinfo_t* infos = bli_create_herk_thrinfo_paths(); dim_t n_threads = thread_num_threads( (&infos[0]) ); // Invoke the internal back-end. diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index ca652f196..ec6c9d31c 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -84,17 +84,6 @@ herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -dim_t read_env( char* env ) -{ - dim_t number = 1; - char* str = getenv( env ); - if( str != NULL ) - { - number = strtol( str, NULL, 10 ); - } - return number; -} - void bli_herk_thrinfo_free_paths( herk_thrinfo_t* threads ) { } @@ -172,8 +161,8 @@ herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) herk_thrinfo_t* jc_info = &paths[global_comm_id]; bli_setup_herk_thrinfo_node( jc_info, global_comm, global_comm_id, - jr_comm, jr_comm_id, - jr_way, a, + jc_comm, jc_comm_id, + jc_way, a, NULL, NULL, kc_info); } } From 0ac534cdf657bbf04601abfe719ba2887aab5da7 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 18 Mar 2014 13:26:27 -0500 Subject: [PATCH 12/42] Added decorator for calling parallelized internal functions Will allow for easy support for different
threading models --- frame/3/gemm/bli_gemm_cntl.c | 5 ---- frame/3/gemm/bli_gemm_cntl.h | 1 - frame/3/gemm/bli_gemm_front.c | 25 ++++++++---------- frame/3/gemm/bli_gemm_threading.c | 16 ++++++------ frame/3/gemm/bli_gemm_threading.h | 4 +-- frame/3/hemm/bli_hemm_front.c | 26 ++++++++----------- frame/3/her2k/bli_her2k_front.c | 42 ++++++++++++++++++++----------- frame/3/herk/bli_herk_front.c | 28 +++++++++------------ frame/3/herk/bli_herk_threading.c | 17 +++++++------ frame/3/herk/bli_herk_threading.h | 4 +-- frame/3/symm/bli_symm_front.c | 30 ++++++++++------------ frame/3/syr2k/bli_syr2k_front.c | 38 +++++++++++++++++----------- frame/3/syrk/bli_syrk_front.c | 23 +++++++++++------ frame/base/bli_threading.c | 32 +++++++++++++++++++++++ frame/base/bli_threading.h | 11 ++++++++ 15 files changed, 177 insertions(+), 125 deletions(-) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index fd6f92c14..2fccb5fc7 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -55,11 +55,6 @@ gemm_t* gemm_cntl_vl_mm; gemm_t* gemm_cntl; -gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos() -{ - return bli_create_gemm_thrinfo_paths( ); -} - void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. 
diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 136f89ef5..882b746eb 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -65,4 +65,3 @@ gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, gemm_t* sub_gemm, unpackm_t* sub_unpack_c ); -gemm_thrinfo_t* bli_gemm_cntl_get_thrinfos(); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 88bc32d9a..a17a600b5 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -74,22 +74,19 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); - dim_t n_threads = thread_num_threads( (&infos[0]) ); + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. - _Pragma( "omp parallel num_threads(n_threads)" ) - { - dim_t omp_id = omp_get_thread_num(); - - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl, - &infos[omp_id] ); - } + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 627df7f9a..6d2ec5f1b 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -95,11 +95,11 @@ dim_t read_env( char* env ) return number; } -void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* threads ) +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads ) { } -gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) +gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) { dim_t jc_way = read_env( "BLIS_JC_NT" ); dim_t kc_way = read_env( "BLIS_KC_NT" ); @@ -117,7 +117,7 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) dim_t ir_nt = 1; - gemm_thrinfo_t* paths = (gemm_thrinfo_t*) malloc( global_num_threads * 
sizeof( gemm_thrinfo_t ) ); + gemm_thrinfo_t** paths = (gemm_thrinfo_t**) malloc( global_num_threads * sizeof( gemm_thrinfo_t* ) ); thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); for( int a = 0; a < jc_way; a++ ) @@ -170,11 +170,11 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ) kc_way, b, NULL, NULL, ic_info); - gemm_thrinfo_t* jc_info = &paths[global_comm_id]; - bli_setup_gemm_thrinfo_node( jc_info, global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - NULL, NULL, kc_info); + gemm_thrinfo_t* jc_info = bli_create_gemm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; } } } diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index 280ba96ad..54a8f4884 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -53,8 +53,8 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_opackm( thread ) thread->opackm #define gemm_thread_sub_ipackm( thread ) thread->ipackm -gemm_thrinfo_t* bli_create_gemm_thrinfo_paths( ); -void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t* ); +gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ); +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** ); void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index fde8f9f70..9d1a7ea5c 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -80,23 +80,19 @@ void bli_hemm_front( side_t side, bli_obj_swap( a_local, b_local ); } - gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); - dim_t n_threads = thread_num_threads( (&infos[0]) ); + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. 
- _Pragma( "omp parallel num_threads(n_threads)" ) - { - dim_t omp_id = omp_get_thread_num(); - - // Invoke the internal back-end. - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl, - &infos[omp_id] ); - } + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 1097c338c..6d019fe57 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -109,22 +109,34 @@ void bli_her2k_front( obj_t* alpha, &c_local, cntl ); #else - // Invoke herk twice, using beta only the first time. - bli_herk_int( alpha, - &a_local, - &bh_local, - beta, - &c_local, - cntl, - &BLIS_HERK_SINGLE_THREADED ); - bli_herk_int( &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - cntl, - &BLIS_HERK_SINGLE_THREADED ); + // Invoke herk twice, using beta only the first time. + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + alpha, + &a_local, + &bh_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + &alpha_conj, + &b_local, + &ah_local, + &BLIS_ONE, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos ); + #endif } diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 19a033a57..33c36fd3b 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,24 +77,20 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - herk_thrinfo_t* infos = bli_create_herk_thrinfo_paths(); - dim_t n_threads = thread_num_threads( (&infos[0]) ); + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); - // Invoke the internal back-end. - _Pragma( "omp parallel num_threads(n_threads)" ) - { - dim_t omp_id = omp_get_thread_num(); + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + alpha, + &a_local, + &ah_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); - - bli_herk_int( alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntl, - &infos[omp_id] ); - } - bli_herk_thrinfo_free_paths( infos ); } diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index ec6c9d31c..942014883 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -84,11 +84,11 @@ herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -void bli_herk_thrinfo_free_paths( herk_thrinfo_t* threads ) +void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads ) { } -herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) +herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) { dim_t jc_way = read_env( "BLIS_JC_NT" ); dim_t kc_way = read_env( "BLIS_KC_NT" ); @@ -106,7 +106,7 @@ herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) dim_t ir_nt = 1; - herk_thrinfo_t* paths = (herk_thrinfo_t*) malloc( global_num_threads * sizeof( herk_thrinfo_t ) ); + herk_thrinfo_t** paths = (herk_thrinfo_t**) malloc( global_num_threads * sizeof( herk_thrinfo_t* ) ); thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); for( int a = 0; a < jc_way; a++ ) @@ -159,11 +159,12 @@ herk_thrinfo_t* bli_create_herk_thrinfo_paths( ) kc_way, b, NULL, NULL, ic_info); - herk_thrinfo_t* jc_info = &paths[global_comm_id]; - bli_setup_herk_thrinfo_node( jc_info, global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - NULL, NULL, kc_info); + herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + + paths[global_comm_id] = jc_info; } } } diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h index f0e206cc7..05e038aab 100644 --- a/frame/3/herk/bli_herk_threading.h +++ 
b/frame/3/herk/bli_herk_threading.h @@ -53,8 +53,8 @@ typedef struct herk_thrinfo_s herk_thrinfo_t; #define herk_thread_sub_opackm( thread ) thread->opackm #define herk_thread_sub_ipackm( thread ) thread->ipackm -herk_thrinfo_t* bli_herk_create_thrinfo_paths( ); -void bli_herk_thrinfo_free_paths(); +herk_thrinfo_t** bli_create_herk_thrinfo_paths( ); +void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths ); void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 99c628c88..cce25b4c8 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -79,23 +79,19 @@ void bli_symm_front( side_t side, bli_obj_swap( a_local, b_local ); } - gemm_thrinfo_t* infos = bli_gemm_cntl_get_thrinfos(); - dim_t n_threads = thread_num_threads( (&infos[0]) ); - - // Invoke the internal back-end. - _Pragma( "omp parallel num_threads(n_threads)" ) - { - dim_t omp_id = omp_get_thread_num(); - - - bli_gemm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl, - &infos[omp_id] ); - } + gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); bli_gemm_thrinfo_free_paths( infos ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index ab2d0d700..fb5d4f0f6 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -93,21 +93,31 @@ void bli_syr2k_front( obj_t* alpha, cntl ); #else // Invoke herk twice, using beta only the first time. 
- bli_herk_int( alpha, - &a_local, - &bt_local, - beta, - &c_local, - cntl, - &BLIS_HERK_SINGLE_THREADED ); + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); - bli_herk_int( alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - cntl, - &BLIS_HERK_SINGLE_THREADED ); + // Invoke the internal back-end. + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + alpha, + &a_local, + &bt_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + alpha, + &b_local, + &at_local, + &BLIS_ONE, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos ); #endif } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 9022c9442..d9039cdb0 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -72,14 +72,21 @@ void bli_syrk_front( obj_t* alpha, { bli_obj_induce_trans( c_local ); } + + herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_herk_int( alpha, - &a_local, - &at_local, - beta, - &c_local, - cntl, - &BLIS_HERK_SINGLE_THREADED ); + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t*) bli_herk_int, + alpha, + &a_local, + &at_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_herk_thrinfo_free_paths( infos ); } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index f830ebc2d..53405bd96 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -95,10 +95,14 @@ void bli_barrier( thread_comm_t* communicator, dim_t t_id ) bool_t my_sense = communicator->barrier_sense; dim_t my_threads_arrived; + _Pragma("omp atomic capture") + my_threads_arrived = communicator->barrier_threads_arrived++; +/* bli_set_lock(&communicator->barrier_lock); my_threads_arrived = communicator->barrier_threads_arrived + 1; communicator->barrier_threads_arrived = my_threads_arrived; bli_unset_lock(&communicator->barrier_lock); +*/ if( my_threads_arrived == communicator->n_threads ) { @@ -223,3 +227,31 @@ void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim *start = work_id * n_pt; *end = bli_min( *start + n_pt, size ); } + +void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) +{ +} + +void bli_level3_thread_decorator( dim_t n_threads, + level3_int_t* func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + void* cntl, + void** thread ) +{ + _Pragma( "omp parallel num_threads(n_threads)" ) + { + dim_t omp_id = omp_get_thread_num(); + + (*func) ( alpha, + a, + b, + beta, + c, + cntl, + thread[omp_id] ); + } +} diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index b944457b5..fdd3ae32a 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -99,4 +99,15 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_gemm_threading.h" #include "bli_herk_threading.h" +typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); 
+void bli_level3_thread_decorator( dim_t num_threads, + level3_int_t* func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + void* cntl, + void** thread ); + #endif From ec8b88f93533942d3711191873310e7ff281bda6 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 18 Mar 2014 14:35:37 -0500 Subject: [PATCH 13/42] Enabled threading for packm blocked variants 3 and 4 --- frame/1m/packm/bli_packm_blk_var3.c | 22 ++++++++++++++-------- frame/1m/packm/bli_packm_blk_var3.h | 6 ++++-- frame/1m/packm/bli_packm_blk_var4.c | 22 ++++++++++++++-------- frame/1m/packm/bli_packm_blk_var4.h | 6 ++++-- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index eb828543b..f7e60e406 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); void bli_packm_blk_var3( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -154,7 +156,8 @@ void bli_packm_blk_var3( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +180,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -296,14 +300,18 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ + p_inc = ps_p; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * 
ip_inc, it = t_id; it < num_iter; \ + ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ + p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -437,8 +445,6 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ -\ - p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h index 6189d2415..b1d684262 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h +++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var3( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* thread \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var3 ) diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index dd1cedfc8..d8721df75 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)( void* kappa, void* c, inc_t rs_c, inc_t cs_c, void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p + dim_t pd_p, inc_t ps_p, + packm_thrinfo_t* thread ); //static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4); void bli_packm_blk_var4( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -154,7 +156,8 @@ void bli_packm_blk_var4( obj_t* c, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, - pd_p, ps_p ); + pd_p, ps_p, + t ); } @@ -177,7 +180,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, 
inc_t ps_p, \ + packm_thrinfo_t* thread \ ) \ { \ ctype* restrict kappa_cast = kappa; \ @@ -296,14 +300,18 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ + dim_t t_id = thread_id( thread ); \ + dim_t num_threads = thread_num_threads( thread ); \ + p_inc = ps_p; \ \ - for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ + for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ + ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ + p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -452,8 +460,6 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ -\ - p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h index e13e5fe33..e727873e4 100644 --- a/frame/1m/packm/bli_packm_blk_var4.h +++ b/frame/1m/packm/bli_packm_blk_var4.h @@ -33,7 +33,8 @@ */ void bli_packm_blk_var4( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* t ); #undef GENTPROTCO @@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \ void* kappa, \ void* c, inc_t rs_c, inc_t cs_c, \ void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p \ + dim_t pd_p, inc_t ps_p, \ + packm_thrinfo_t* t \ ); INSERT_GENTPROTCO_BASIC( packm_blk_var4 ) From aa2405f8b23d0f8d2ec04790882f2176ef2e8fd8 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 18 Mar 2014 15:23:09 -0500 Subject: [PATCH 14/42] Fixing function pointer issues with thread decorator --- frame/3/gemm/bli_gemm_front.c | 2 +- frame/base/bli_threading.c | 14 ++++++++------ frame/base/bli_threading.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index a17a600b5..2211625a5 100644 
--- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -79,7 +79,7 @@ void bli_gemm_front( obj_t* alpha, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_gemm_int, + (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 53405bd96..b1814a023 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -233,7 +233,7 @@ void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool } void bli_level3_thread_decorator( dim_t n_threads, - level3_int_t* func, + level3_int_t func, obj_t* alpha, obj_t* a, obj_t* b, @@ -242,16 +242,18 @@ void bli_level3_thread_decorator( dim_t n_threads, void* cntl, void** thread ) { - _Pragma( "omp parallel num_threads(n_threads)" ) + //_Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); + // dim_t omp_id = omp_get_thread_num(); - (*func) ( alpha, + func( alpha, +// bli_gemm_int ( alpha, a, b, beta, c, - cntl, - thread[omp_id] ); + (gemm_t*)cntl, +// thread[omp_id] ); + (gemm_thrinfo_t*)thread[0] ); } } diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index fdd3ae32a..bb2bd6ba3 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -101,7 +101,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); void bli_level3_thread_decorator( dim_t num_threads, - level3_int_t* func, + level3_int_t func, obj_t* alpha, obj_t* a, obj_t* b, From fb42983bd9943711baa7d1c6496de1215bb816ef Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 18 Mar 2014 16:37:28 -0500 Subject: [PATCH 15/42] Fixed a barrier bug and a thread decorator bug --- frame/base/bli_threading.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git 
a/frame/base/bli_threading.c b/frame/base/bli_threading.c index b1814a023..d612210b2 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -96,7 +96,8 @@ void bli_barrier( thread_comm_t* communicator, dim_t t_id ) dim_t my_threads_arrived; _Pragma("omp atomic capture") - my_threads_arrived = communicator->barrier_threads_arrived++; + my_threads_arrived = ++(communicator->barrier_threads_arrived); + /* bli_set_lock(&communicator->barrier_lock); my_threads_arrived = communicator->barrier_threads_arrived + 1; @@ -242,18 +243,16 @@ void bli_level3_thread_decorator( dim_t n_threads, void* cntl, void** thread ) { - //_Pragma( "omp parallel num_threads(n_threads)" ) + _Pragma( "omp parallel num_threads(n_threads)" ) { - // dim_t omp_id = omp_get_thread_num(); + dim_t omp_id = omp_get_thread_num(); func( alpha, -// bli_gemm_int ( alpha, a, b, beta, c, - (gemm_t*)cntl, -// thread[omp_id] ); - (gemm_thrinfo_t*)thread[0] ); + cntl, + thread[omp_id] ); } } From c0140cb752f27e99742f85d23be2181c00a1335e Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 19 Mar 2014 11:21:16 -0500 Subject: [PATCH 16/42] Fixed packm variants 3 and 4 where every thread was trying to manipulate the same state Now just performed by the master thread. --- frame/1m/packm/bli_packm_blk_var3.c | 35 ++++++++++++++++------------- frame/1m/packm/bli_packm_blk_var4.c | 35 ++++++++++++++++------------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index f7e60e406..bf93341c6 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -100,7 +100,7 @@ void bli_packm_blk_var3( obj_t* c, // in the real domain. if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( c, p, t ); return; } @@ -111,23 +111,26 @@ void bli_packm_blk_var3( obj_t* c, // real domain counterparts. 
(In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + if ( thread_am_ochief( t ) ) { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index d8721df75..3d1ab78dc 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -100,7 +100,7 @@ void bli_packm_blk_var4( obj_t* c, // in the real domain. if ( bli_is_real( dt_cp ) ) { - bli_packm_blk_var1( c, p, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( c, p, t ); return; } @@ -111,23 +111,26 @@ void bli_packm_blk_var4( obj_t* c, // real domain counterparts. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. 
- bli_obj_scalar_detach( p, &kappa ); + if( thread_am_ochief( t ) ) { + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + } + kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. From 5d5dc2eedef2f7c90d61371a1b457be5c06cf583 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 20 Mar 2014 16:43:36 -0500 Subject: [PATCH 17/42] Parallelized trmm and trmm3 Also fixed bugs in packm --- frame/1m/packm/bli_packm_blk_var1.c | 23 ++- frame/1m/packm/bli_packm_blk_var3.c | 19 ++- frame/1m/packm/bli_packm_blk_var4.c | 19 ++- frame/1m/packm/bli_packm_threading.h | 2 + frame/3/gemm/bli_gemm_blk_var1f.c | 2 +- frame/3/gemm/bli_gemm_blk_var2f.c | 2 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/her2k/bli_her2k_front.c | 4 +- frame/3/herk/bli_herk_blk_var1f.c | 2 +- frame/3/herk/bli_herk_blk_var2f.c | 2 +- frame/3/herk/bli_herk_front.c | 2 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/syr2k/bli_syr2k_front.c | 4 +- frame/3/syrk/bli_syrk_front.c | 2 +- frame/3/trmm/bli_trmm_blk_var1f.c | 111 +++++++++----- frame/3/trmm/bli_trmm_blk_var1f.h | 3 +- frame/3/trmm/bli_trmm_blk_var2b.c | 116 +++++++++----- frame/3/trmm/bli_trmm_blk_var2b.h | 3 +- frame/3/trmm/bli_trmm_blk_var2f.c | 116 +++++++++----- frame/3/trmm/bli_trmm_blk_var2f.h | 3 +- 
frame/3/trmm/bli_trmm_blk_var3b.c | 109 ++++++++----- frame/3/trmm/bli_trmm_blk_var3b.h | 3 +- frame/3/trmm/bli_trmm_blk_var3f.c | 109 ++++++++----- frame/3/trmm/bli_trmm_blk_var3f.h | 3 +- frame/3/trmm/bli_trmm_front.c | 22 ++- frame/3/trmm/bli_trmm_int.c | 9 +- frame/3/trmm/bli_trmm_int.h | 3 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 222 ++++++++++++++------------- frame/3/trmm/bli_trmm_ll_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 24 ++- frame/3/trmm/bli_trmm_lu_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 23 ++- frame/3/trmm/bli_trmm_rl_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 23 ++- frame/3/trmm/bli_trmm_ru_ker_var2.h | 6 +- frame/3/trmm/bli_trmm_threading.c | 173 +++++++++++++++++++++ frame/3/trmm/bli_trmm_threading.h | 79 ++++++++++ frame/3/trmm3/bli_trmm3_front.c | 22 ++- frame/base/bli_threading.c | 9 +- frame/base/bli_threading.h | 4 +- 40 files changed, 897 insertions(+), 403 deletions(-) create mode 100644 frame/3/trmm/bli_trmm_threading.c create mode 100644 frame/3/trmm/bli_trmm_threading.h diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index d8c84425b..7a5caf7de 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -263,18 +263,14 @@ void PASTEMAC(ch,varname )( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, 
*n_panel_full ) ) \ @@ -323,6 +319,8 @@ void PASTEMAC(ch,varname )( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk)( strucc, \ diagoffp_i, \ diagc, \ @@ -336,6 +334,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + }\ \ \ p_inc = ldp * panel_len_max_i; \ @@ -349,6 +348,8 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk)( strucc, \ diagoffc_i, \ uploc, \ @@ -360,6 +361,7 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -373,6 +375,8 @@ void PASTEMAC(ch,varname )( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -384,10 +388,13 @@ void PASTEMAC(ch,varname )( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ -\ + } \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ldp * panel_len_max_i; \ - } \ + } \ +\ +\ + p_begin += p_inc; \ } \ \ \ diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index bf93341c6..2d69e51d7 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ diagoffp_i, \ diagc, \ @@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -399,6 +398,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ diagoffc_i, \ uploc, \ @@ -411,6 +412,7 @@ void PASTEMAC(ch,varname)( \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ \ + } \ /* NOTE: This value is equivalent to ps_p. 
*/ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ } \ @@ -423,6 +425,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -434,6 +438,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ @@ -448,6 +453,8 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ +\ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c index 3d1ab78dc..8cfd49afa 100644 --- a/frame/1m/packm/bli_packm_blk_var4.c +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -303,18 +303,14 @@ void PASTEMAC(ch,varname)( \ } \ \ p_begin = p_cast; \ - dim_t t_id = thread_id( thread ); \ - dim_t num_threads = thread_num_threads( thread ); \ - p_inc = ps_p; \ \ - for ( ic = ic0 + t_id * ic_inc, ip = ip0 + t_id * ip_inc, it = t_id; it < num_iter; \ - ic += num_threads * ic_inc, ip += num_threads * ip_inc, it += num_threads ) \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ c_begin = c_cast + (ic )*vs_c; \ - p_begin = p_cast + (ip )*p_inc; \ \ if ( bli_is_triangular( strucc ) && \ bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ @@ -363,6 +359,8 @@ void PASTEMAC(ch,varname)( \ c_use = c_begin + (panel_off_i )*ldc; \ p_use = p_begin; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \ diagoffp_i, \ diagc, \ @@ -376,6 +374,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_use, rs_c, cs_c, \ p_use, rs_p, cs_p ); \ + } \ \ p_inc = ldp * panel_len_max_i; \ \ @@ -406,6 +405,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = 
panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ diagoffc_i, \ uploc, \ @@ -417,6 +418,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -430,6 +432,8 @@ void PASTEMAC(ch,varname)( \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ + if( packm_thread_my_iter( it, thread ) ) \ + { \ PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ 0, \ BLIS_DENSE, \ @@ -441,6 +445,7 @@ void PASTEMAC(ch,varname)( \ kappa_cast, \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ + } \ \ /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ @@ -463,6 +468,8 @@ void PASTEMAC(ch,varname)( \ */ \ \ } \ +\ + p_begin += p_inc; \ } \ } diff --git a/frame/1m/packm/bli_packm_threading.h b/frame/1m/packm/bli_packm_threading.h index 12be0c9cd..0d6fce2e4 100644 --- a/frame/1m/packm/bli_packm_threading.h +++ b/frame/1m/packm/bli_packm_threading.h @@ -44,6 +44,8 @@ struct packm_thrinfo_s //implements thrinfo_t }; typedef struct packm_thrinfo_s packm_thrinfo_t; +#define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) + packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 368c303cf..c3e5db6c0 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. 
m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, m_trans, 8, &start, &end ); + bli_get_range( thread, 0, m_trans, 8, &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index 66f2ce70b..82aad8b3d 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -82,7 +82,7 @@ void bli_gemm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, n_trans, 8, &start, &end ); + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 9d1a7ea5c..c3a708211 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -85,7 +85,7 @@ void bli_hemm_front( side_t side, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_gemm_int, + (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 6d019fe57..b8329cf5b 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -116,7 +116,7 @@ void bli_her2k_front( obj_t* alpha, // Invoke the internal back-end. 
bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &bh_local, @@ -126,7 +126,7 @@ void bli_her2k_front( obj_t* alpha, (void**) infos ); bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, &alpha_conj, &b_local, &ah_local, diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 88671b99f..899aa194c 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range( thread, m_trans, 8, &start, &end ); + bli_get_range( thread, 0, m_trans, 8, &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index a1fba63f4..3ef777247 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,7 +90,7 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - bli_get_range( thread, n_trans, 8, &start, &end ); + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 33c36fd3b..6139478ea 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -82,7 +82,7 @@ void bli_herk_front( obj_t* alpha, // Invoke the internal back-end. 
bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &ah_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index cce25b4c8..ed0c44664 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -84,7 +84,7 @@ void bli_symm_front( side_t side, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_gemm_int, + (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index fb5d4f0f6..f1ce3e279 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -98,7 +98,7 @@ void bli_syr2k_front( obj_t* alpha, // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &bt_local, @@ -108,7 +108,7 @@ void bli_syr2k_front( obj_t* alpha, (void**) infos ); bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &b_local, &at_local, diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index d9039cdb0..c5ac22797 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -78,7 +78,7 @@ void bli_syrk_front( obj_t* alpha, // Invoke the internal back-end. 
bli_level3_thread_decorator( n_threads, - (level3_int_t*) bli_herk_int, + (level3_int_t) bli_herk_int, alpha, &a_local, &at_local, diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 23238a089..ac1973366 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -37,21 +37,48 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1, c1_pack; + obj_t b_pack_s; + obj_t a1_pack_s, c1_pack_s; + + obj_t a1, c1; + obj_t* a1_pack = NULL; + obj_t* b_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; + if( thread_am_ochief( thread ) ) { + // Initialize object for packing B. + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + + // Scale C by beta (if instructed). + // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); - bli_obj_init_pack( &c1_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack B (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trmm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -66,25 +93,14 @@ void bli_trmm_blk_var1f( obj_t* a, m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) + bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). 
- bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -94,38 +110,55 @@ void bli_trmm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). 
- bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( b_pack ); + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.h b/frame/3/trmm/bli_trmm_blk_var1f.h index c9fc004f7..63994a9a6 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.h +++ b/frame/3/trmm/bli_trmm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 0c98da8e6..2a211bdbc 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -37,43 +37,58 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, n_trans, b, + b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -83,38 +98,55 @@ void bli_trmm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2b.h b/frame/3/trmm/bli_trmm_blk_var2b.h index e8d54ecdb..afb9f9903 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.h +++ b/frame/3/trmm/bli_trmm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 14571322b..f1ccedd45 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -37,43 +37,58 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + + if( thread_am_ochief( thread ) ) { + // Initialize object for packing A + bli_obj_init_pack( &a_pack_s ); + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, 0, n_trans, 8, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, n_trans, b, + b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. @@ -83,38 +98,55 @@ void bli_trmm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). 
- bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be finished before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trmm( cntl ) ); + c1_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); - // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Unpack C1 (if C1 was packed). + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var2f.h b/frame/3/trmm/bli_trmm_blk_var2f.h index 148bbd234..8c47d55b8 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.h +++ b/frame/3/trmm/bli_trmm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 11b3dc551..40e9e21d6 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -37,38 +37,50 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,38 +95,51 @@ void bli_trmm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). 
- bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3b.h b/frame/3/trmm/bli_trmm_blk_var3b.h index bcd4c8c4b..e3a5bfbb3 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.h +++ b/frame/3/trmm/bli_trmm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 59050423c..80293e42f 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -37,38 +37,50 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + if( thread_am_ochief( thread ) ){ + // Initialize object for packing C + bli_obj_init_pack( &c_pack_s ); + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + // Initialize pack objects for A and B that are passed into packm_init(). + if( thread_am_ichief( thread ) ){ + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,38 +95,51 @@ void bli_trmm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trmm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trmm subproblem. 
bli_trmm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trmm( cntl ) ); + c_pack, + cntl_sub_trmm( cntl ), + trmm_thread_sub_trmm( thread ) ); } - // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + thread_obarrier( thread ); - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + // Unpack C (if C was packed). + if( thread_am_ochief( thread ) ){ + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } + + // If any packing buffers were acquired within packm, release them back + // to the memory manager. + if( thread_am_ichief( thread ) ){ + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trmm/bli_trmm_blk_var3f.h b/frame/3/trmm/bli_trmm_blk_var3f.h index 4be2c7b3c..6f9338cbb 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.h +++ b/frame/3/trmm/bli_trmm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trmm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1911ba3be..644f27d4b 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -125,12 +125,20 @@ void bli_trmm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trmm_int( alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos ); } diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 287205873..56327008b 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -173,6 +175,7 @@ void bli_trmm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/bli_trmm_int.h index 18c2d0da0..70d8b551e 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/bli_trmm_int.h @@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 01fc281ee..99e0dcec7 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + 
gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. */ \ @@ -270,9 +274,12 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ - { \ + for ( j = 0; j < n_iter; ++j ) { \ +\ + if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ +\ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ @@ -307,121 +314,124 @@ void PASTEMAC(ch,varname)( \ off_a1011 = 0; \ k_a1011 = diagoffa_i + MR; \ \ - b1_i = b1 + off_a1011 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * ss_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ -\ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ -\ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr_cast( k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + b1_i = b1 + off_a1011 * PACKNR; \ \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + k_a1011 * ss_a; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ + \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ + \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ + \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Copy edge elements of C to the temporary buffer. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + c11, rs_c, cs_c, \ + ct, rs_ct, cs_ct ); \ + \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + ct, rs_ct, cs_ct, \ + &aux ); \ + \ + /* Copy the result to the edge of C. */ \ + PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += k_a1011 * ss_a; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter( i, m_iter ) ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter( j, n_iter ) ) \ - b2 = b_cast; \ - } \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( bli_is_last_iter( i, m_iter ) ) \ + { \ + a2 = a_cast; \ + b2 = b1 + cstep_b; \ + if ( bli_is_last_iter( j, n_iter ) ) \ + b2 = b_cast; \ + } \ \ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, aux ); \ - bli_auxinfo_set_next_b( b2, aux ); \ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, aux ); \ + bli_auxinfo_set_next_b( b2, aux ); \ \ - /* Save the panel stride of the current panel of A to the - auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( rstep_a, aux ); \ + /* Save the panel stride of the current panel of A to the + auxinfo_t object. */ \ + bli_auxinfo_set_ps_a( rstep_a, aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr_cast( k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Handle interior and edge cases separately. */ \ + if ( m_cur == MR && n_cur == NR ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux ); \ + } \ + else \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr_cast( k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux ); \ \ + /* Add the result to the edge of C. 
*/ \ + PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c ); \ + } \ + } \ a1 += rstep_a; \ } \ -\ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.h b/frame/3/trmm/bli_trmm_ll_ker_var2.h index eb9cb1cc5..9710adc7c 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 867809da0..0622bbbb2 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \ \ b1 = b_cast; \ c1 = c_cast; \ +\ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ @@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ + for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ @@ -315,6 +321,7 @@ void PASTEMAC(ch,varname)( \ off_a1112 = diagoffa_i; \ k_a1112 = k - off_a1112; \ \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ @@ -369,11 +376,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += k_a1112 * ss_a; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ + if( trmm_l_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ @@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ -\ + } \ b1 += cstep_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.h b/frame/3/trmm/bli_trmm_lu_ker_var2.h index 3ba1f0ca7..508612a90 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.h +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae4b4b1d2..f48baf4b3 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b1121 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.h b/frame/3/trmm/bli_trmm_rl_ker_var2.h index 3059aaaa9..d1e998bf6 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.h +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 57d112ce5..d9a28f86d 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)( void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* gemm_ukr + void* gemm_ukr, + trmm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); @@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ) + trmm_t* cntl, + trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* jr_thread \ ) \ { \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ @@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \ b1 = b_cast; \ c1 = c_cast; \ \ + trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ @@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \ so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ +\ + if( trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ @@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ @@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ @@ -378,6 +386,7 @@ void PASTEMAC(ch,varname)( \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ + if( trmm_r_ir_my_iter( i, ir_thread ) ) { \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ @@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ -\ + } \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ -\ + } \ b1 += k_b0111 * ss_b; \ c1 += cstep_c; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.h b/frame/3/trmm/bli_trmm_ru_ker_var2.h index 93c22402f..cb4a7b937 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.h +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trmm_t* cntl ); + trmm_t* cntl, + trmm_thrinfo_t* thread ); // @@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trmm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 )
diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c
new file mode 100644
index 000000000..0a9d83da2
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_threading.c
@@ -0,0 +1,196 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+// Populate an already-allocated trmm_thrinfo_t node by storing each argument
+// into the field of the same name.
+void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
+                                  thread_comm_t* ocomm, dim_t ocomm_id,
+                                  thread_comm_t* icomm, dim_t icomm_id,
+                                  dim_t n_way, dim_t work_id,
+                                  packm_thrinfo_t* opackm,
+                                  packm_thrinfo_t* ipackm,
+                                  trmm_thrinfo_t* sub_trmm )
+{
+    thread->ocomm = ocomm;
+    thread->ocomm_id = ocomm_id;
+    thread->icomm = icomm;
+    thread->icomm_id = icomm_id;
+    thread->n_way = n_way;
+    thread->work_id = work_id;
+    thread->opackm = opackm;
+    thread->ipackm = ipackm;
+    thread->sub_trmm = sub_trmm;
+}
+
+// Fill in a node for single-threaded execution: both communicators are
+// BLIS_SINGLE_COMM, n_way is 1, all ids are 0, both packm infos are
+// BLIS_PACKM_SINGLE_THREADED, and sub_trmm points back at the node itself.
+void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread )
+{
+    thread->ocomm = &BLIS_SINGLE_COMM;
+    thread->ocomm_id = 0;
+    thread->icomm = &BLIS_SINGLE_COMM;
+    thread->icomm_id = 0;
+    thread->n_way = 1;
+    thread->work_id = 0;
+    thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
+    thread->sub_trmm = thread;
+}
+
+// Allocate a node with bli_malloc() and initialize it through
+// bli_setup_trmm_thrinfo_node(). Returns the new node.
+trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
+                                              thread_comm_t* icomm, dim_t icomm_id,
+                                              dim_t n_way, dim_t work_id,
+                                              packm_thrinfo_t* opackm,
+                                              packm_thrinfo_t* ipackm,
+                                              trmm_thrinfo_t* sub_trmm )
+{
+    trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc( sizeof( trmm_thrinfo_t ) );
+    bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id,
+                                 icomm, icomm_id,
+                                 n_way, work_id,
+                                 opackm,
+                                 ipackm,
+                                 sub_trmm );
+    return thread;
+}
+
+// Counterpart of bli_create_trmm_thrinfo_paths().
+// NOTE(review): this is currently a no-op, and nothing in this file releases
+// the paths array, the nodes, the communicators or the packm infos that
+// bli_create_trmm_thrinfo_paths() allocates. TODO: implement the teardown.
+void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads )
+{
+}
+
+// Build one chain of thrinfo nodes per thread (jc -> kc -> ic -> jr -> ir,
+// linked through sub_trmm) and return the chains in an array indexed by the
+// thread's id within the global communicator. The number of ways each loop is
+// split is read with read_env() from BLIS_JC_NT, BLIS_KC_NT, BLIS_IC_NT,
+// BLIS_JR_NT and BLIS_IR_NT.
+trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( )
+{
+    dim_t jc_way = read_env( "BLIS_JC_NT" );
+    dim_t kc_way = read_env( "BLIS_KC_NT" );
+    dim_t ic_way = read_env( "BLIS_IC_NT" );
+    dim_t jr_way = read_env( "BLIS_JR_NT" );
+    dim_t ir_way = read_env( "BLIS_IR_NT" );
+
+    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
+    // Each factor must be at least 1: a zero or negative factor would leave the
+    // loop nest below with no iterations and 'paths' unpopulated.
+    assert( jc_way > 0 && kc_way > 0 && ic_way > 0 && jr_way > 0 && ir_way > 0 );
+
+    // Number of threads that share one communicator at each level.
+    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
+    dim_t kc_nt = ic_way * jr_way * ir_way;
+    dim_t ic_nt = jr_way * ir_way;
+    dim_t jr_nt = ir_way;
+    dim_t ir_nt = 1;
+
+
+    trmm_thrinfo_t** paths = (trmm_thrinfo_t**) malloc( global_num_threads * sizeof( trmm_thrinfo_t* ) );
+    assert( paths != NULL );
+
+    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
+    for( int a = 0; a < jc_way; a++ )
+    {
+        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
+        for( int b = 0; b < kc_way; b++ )
+        {
+            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
+            for( int c = 0; c < ic_way; c++ )
+            {
+                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
+                for( int d = 0; d < jr_way; d++ )
+                {
+                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
+                    for( int e = 0; e < ir_way; e++)
+                    {
+                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
+                        // Flatten the loop indices into this thread's id within
+                        // each enclosing communicator.
+                        dim_t ir_comm_id = 0;
+                        dim_t jr_comm_id = e*ir_nt + ir_comm_id;
+                        dim_t ic_comm_id = d*jr_nt + jr_comm_id;
+                        dim_t kc_comm_id = c*ic_nt + ic_comm_id;
+                        dim_t jc_comm_id = b*kc_nt + kc_comm_id;
+                        dim_t global_comm_id = a*jc_nt + jc_comm_id;
+
+                        // Build this thread's chain from the innermost (ir) node outward.
+                        trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
+                                                                                ir_comm, ir_comm_id,
+                                                                                ir_way, e,
+                                                                                NULL, NULL, NULL);
+
+                        trmm_thrinfo_t* jr_info = bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id,
+                                                                                jr_comm, jr_comm_id,
+                                                                                jr_way, d,
+                                                                                NULL, NULL, ir_info);
+
+                        packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
+                                                                               ic_comm, ic_comm_id,
+                                                                               kc_nt, kc_comm_id );
+
+                        packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
+                                                                               jr_comm, jr_comm_id,
+                                                                               ic_nt, ic_comm_id );
+
+                        trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
+                                                                                ic_comm, ic_comm_id,
+                                                                                ic_way, c,
+                                                                                packb, packa, jr_info);
+
+                        trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
+                                                                                kc_comm, kc_comm_id,
+                                                                                kc_way, b,
+                                                                                NULL, NULL, ic_info);
+
+                        trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
+                                                                                jc_comm, jc_comm_id,
+                                                                                jc_way, a,
+                                                                                NULL, NULL, kc_info);
+                        paths[global_comm_id] = jc_info;
+                    }
+                }
+            }
+        }
+    }
+    return paths;
+}
diff --git a/frame/3/trmm/bli_trmm_threading.h b/frame/3/trmm/bli_trmm_threading.h
new file mode 100644
index 000000000..376608261
--- /dev/null
+++ b/frame/3/trmm/bli_trmm_threading.h
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct trmm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct trmm_thrinfo_s* sub_trmm; +}; +typedef struct trmm_thrinfo_s trmm_thrinfo_t; + +#define trmm_thread_sub_trmm( thread ) thread->sub_trmm +#define trmm_thread_sub_opackm( thread ) thread->opackm +#define trmm_thread_sub_ipackm( thread ) thread->ipackm + +#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == 
thread->work_id % thread->n_way ) + +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ); +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** ); + +void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trmm_thrinfo_t* sub_trmm ); + +void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 1d4a68918..080b9a399 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,12 +127,20 @@ void bli_trmm3_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trmm_int( alpha, - &a_local, - &b_local, - beta, - &c_local, - cntl ); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trmm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trmm_thrinfo_free_paths( infos ); } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index d612210b2..c0ef641ea 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -216,17 +216,18 @@ thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) return info_paths; } */ -void bli_get_range( void* thr, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ) +void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) { thrinfo_t* thread = (thrinfo_t*) thr; - dim_t n_way = thread->n_way; dim_t work_id = thread->work_id; + + dim_t size = all_end - all_start; dim_t n_pt = size / n_way; n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); - *start = work_id * n_pt; - *end = bli_min( *start + n_pt, size ); + *start = work_id * n_pt + all_start; + *end = bli_min( *start + n_pt, size + all_start ); } void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index bb2bd6ba3..daaf2d6f4 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -87,7 +87,8 @@ typedef struct thrinfo_s thrinfo_t; #define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) #define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) -void bli_get_range( void* thread, dim_t size, dim_t block_factor, dim_t* start, dim_t* end ); +void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); + thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t 
n_way, dim_t work_id ); void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, @@ -98,6 +99,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_packm_threading.h" #include "bli_gemm_threading.h" #include "bli_herk_threading.h" +#include "bli_trmm_threading.h" typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); void bli_level3_thread_decorator( dim_t num_threads, From f0824a04fc75e231c3a3d7757fa4e7294173282f Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 24 Mar 2014 15:21:42 -0500 Subject: [PATCH 18/42] Initial commit to enable threading in TRSM, Also enabled weighted partitioning for herk, trmm Fixed bug where multiple threads would try to modify the same state in the internal level 3 functions Correctly computed a_next and b_next for gemm, herk macrokernels a_next and b_next point to the current micropanels in trmm --- frame/3/gemm/bli_gemm_blk_var3f.c | 16 +-- frame/3/gemm/bli_gemm_int.c | 21 +++- frame/3/gemm/bli_gemm_ker_var2.c | 4 +- frame/3/gemm/bli_gemm_threading.c | 21 +--- frame/3/gemm/bli_gemm_threading.h | 4 + frame/3/herk/bli_herk_blk_var2f.c | 3 +- frame/3/herk/bli_herk_int.c | 18 ++- frame/3/herk/bli_herk_l_ker_var2.c | 4 +- frame/3/herk/bli_herk_threading.c | 10 +- frame/3/herk/bli_herk_threading.h | 5 + frame/3/herk/bli_herk_u_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_blk_var2b.c | 3 +- frame/3/trmm/bli_trmm_blk_var2f.c | 3 +- frame/3/trmm/bli_trmm_int.c | 18 ++- frame/3/trmm/bli_trmm_ll_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 8 +- frame/3/trmm/bli_trmm_threading.c | 10 +- frame/3/trsm/bli_trsm_blk_var1b.c | 76 +++++++----- frame/3/trsm/bli_trsm_blk_var1b.h | 3 +- frame/3/trsm/bli_trsm_blk_var1f.c | 72 +++++++----- frame/3/trsm/bli_trsm_blk_var1f.h | 3 +- 
frame/3/trsm/bli_trsm_blk_var2b.c | 113 +++++++++++------- frame/3/trsm/bli_trsm_blk_var2b.h | 3 +- frame/3/trsm/bli_trsm_blk_var2f.c | 114 +++++++++++------- frame/3/trsm/bli_trsm_blk_var2f.h | 3 +- frame/3/trsm/bli_trsm_blk_var3b.c | 112 +++++++++++------- frame/3/trsm/bli_trsm_blk_var3b.h | 3 +- frame/3/trsm/bli_trsm_blk_var3f.c | 112 +++++++++++------- frame/3/trsm/bli_trsm_blk_var3f.h | 3 +- frame/3/trsm/bli_trsm_front.c | 22 ++-- frame/3/trsm/bli_trsm_int.c | 30 +++-- frame/3/trsm/bli_trsm_int.h | 3 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_ll_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_lu_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_rl_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_ru_ker_var2.h | 6 +- frame/3/trsm/bli_trsm_threading.c | 173 ++++++++++++++++++++++++++++ frame/3/trsm/bli_trsm_threading.h | 79 +++++++++++++ frame/base/bli_threading.c | 56 ++++++++- frame/base/bli_threading.h | 3 + 46 files changed, 889 insertions(+), 342 deletions(-) create mode 100644 frame/3/trsm/bli_trsm_threading.c create mode 100644 frame/3/trsm/bli_trsm_threading.h diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index f0647ccb3..a6a70181b 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -115,14 +115,6 @@ void bli_gemm_blk_var3f( obj_t* a, bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), gemm_thread_sub_ipackm( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. 
- if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); // Packing must be done before computation. thread_ibarrier( thread ); @@ -136,6 +128,14 @@ void bli_gemm_blk_var3f( obj_t* a, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread) ); + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c_pack is a local obj_t, we can simply overwrite the + // internal beta scalar with BLIS_ONE once it has been used in the + // first iteration. + if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + } thread_obarrier( thread ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 5218ab8c0..9c0adee84 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -79,7 +79,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -88,7 +90,9 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -106,23 +110,28 @@ void bli_gemm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } + thread_obarrier( thread ); // Extract the variant number and implementation type. n = cntl_var_num( cntl ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 7d0734e40..2d5cc7bca 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -249,11 +249,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 6d2ec5f1b..047b083cf 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -84,28 +84,17 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -dim_t read_env( char* env ) -{ - dim_t number = 1; - char* str = getenv( env ); - if( str != NULL ) - { - number = strtol( str, NULL, 10 ); - } - return number; -} - void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads ) { } gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = 
read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index 54a8f4884..24bf6d734 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -53,6 +53,10 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_thread_sub_opackm( thread ) thread->opackm #define gemm_thread_sub_ipackm( thread ) thread->ipackm +// For use in gemm micro-kernel +#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) + gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ); void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** ); diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 3ef777247..5fcb56001 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,7 +90,8 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index 64fd7b1c4..0bc5c6a9b 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -89,7 +89,9 @@ void bli_herk_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -107,28 +109,34 @@ void bli_herk_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; + thread_obarrier( thread ); + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index c4d46718b..464e54588 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 942014883..2b291a924 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -90,11 +90,11 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads ) herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h index 05e038aab..d156547a8 100644 --- a/frame/3/herk/bli_herk_threading.h +++ b/frame/3/herk/bli_herk_threading.h @@ -53,6 +53,11 @@ typedef struct herk_thrinfo_s herk_thrinfo_t; #define herk_thread_sub_opackm( thread ) thread->opackm #define herk_thread_sub_ipackm( thread ) thread->ipackm +// For use in herk micro-kernel +#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) + + herk_thrinfo_t** bli_create_herk_thrinfo_paths( ); void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths ); diff 
--git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 573738c0f..694f8a211 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -286,11 +286,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 2a211bdbc..86787a80a 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -82,7 +82,8 @@ void bli_trmm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index f1ccedd45..39033fcf3 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -82,7 +82,8 @@ void bli_trmm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 56327008b..0148a670b 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -111,7 +111,9 @@ void bli_trmm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -129,22 +131,26 @@ void bli_trmm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -164,6 +170,8 @@ void bli_trmm_int( obj_t* alpha, else uplo = 1; } + thread_obarrier( thread ); + // Extract the variant number and implementation type. 
n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 772d91816..b5950a603 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -320,11 +320,11 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1011 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * ss_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -381,11 +381,11 @@ void PASTEMAC(ch,varname)( \ if( trmm_l_ir_my_iter( i, ir_thread ) ) \ { \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 0622bbbb2..e4568c70c 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -325,11 +325,11 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1112 * ss_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -385,11 +385,11 @@ void PASTEMAC(ch,varname)( \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index f48baf4b3..296325ec8 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b1121 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b1121 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 97626a717..7f13e47a8 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -329,11 +329,11 @@ void PASTEMAC(ch,varname)( \ a1_i = a1 + off_b0111 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b0111 * ss_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -392,11 +392,11 @@ void PASTEMAC(ch,varname)( \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ + a2 = a1; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + cstep_b; \ + b2 = b1; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 0a9d83da2..3a6a7c0b4 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -90,11 +90,11 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads ) trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) { - dim_t jc_way = read_env( "BLIS_JC_NT" ); - dim_t kc_way = read_env( "BLIS_KC_NT" ); - dim_t ic_way = read_env( "BLIS_IC_NT" ); - dim_t jr_way = read_env( "BLIS_JR_NT" ); - dim_t ir_way = read_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 6d4681f35..66b3e9fc7 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. 
+ if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing B. + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_obroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -60,22 +79,16 @@ void bli_trsm_blk_var1b( obj_t* a, // A begins. if ( bli_obj_is_upper( *a ) ) offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) - - bli_obj_width_after_trans( *a ); + bli_obj_width_after_trans( *a ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_b( i, m_trans, a, + b_alg = bli_determine_blocksize_b( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -84,29 +97,34 @@ void bli_trsm_blk_var1b( obj_t* a, bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); - //if ( bli_obj_is_zeros( a1 ) ) continue; - // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a1_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( b_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1b.h b/frame/3/trsm/bli_trsm_blk_var1b.h index 4ced0fc92..99585b947 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.h +++ b/frame/3/trsm/bli_trsm_blk_var1b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index 8177e183b..0525db3be 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -37,20 +37,39 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b_pack; - obj_t c1; + obj_t b_pack_s; + obj_t a1_pack_s; + + obj_t a1, c1; + obj_t* b_pack = NULL; + obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b_pack ); + // Initialize object for packing B. + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &b_pack_s ); + bli_packm_init( b, &b_pack_s, + cntl_sub_packm_b( cntl ) ); + } + b_pack = thread_obroadcast( thread, &b_pack_s ); + + // Initialize object for packing B. 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + } + a1_pack = thread_obroadcast( thread, &a1_pack_s ); + + // Pack B1 (if instructed). + bli_packm_int( b, b_pack, + cntl_sub_packm_b( cntl ), + trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); @@ -61,20 +80,14 @@ void bli_trsm_blk_var1f( obj_t* a, if ( bli_obj_is_lower( *a ) ) offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); - // Initialize object for packing B. - bli_packm_init( b, &b_pack, - cntl_sub_packm_b( cntl ) ); - - // Pack B1 (if instructed). - bli_packm_int( b, &b_pack, - cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range( thread, offA, m_trans, 8, &start, &end ); // Partition along the remaining portion of the m dimension. - for ( i = offA; i < m_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, m_trans, a, + b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. @@ -84,26 +97,33 @@ void bli_trsm_blk_var1f( obj_t* a, i, b_alg, c, &c1 ); // Initialize object for packing A1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b_pack, + a1_pack, + b_pack, &BLIS_ONE, &c1, - cntl_sub_trsm( cntl ) ); + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a1_pack ); + if( thread_am_ichief( thread ) ) + bli_obj_release_pack( b_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var1f.h b/frame/3/trsm/bli_trsm_blk_var1f.h index c815c03ff..48384c369 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.h +++ b/frame/3/trsm/bli_trsm_blk_var1f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 724b88f2d..435c9dec3 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -37,40 +37,56 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). 
+ if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, n_trans, b, @@ -83,38 +99,55 @@ void bli_trsm_blk_var2b( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2b.h b/frame/3/trsm/bli_trsm_blk_var2b.h index fb352ce39..de4a8f899 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.h +++ b/frame/3/trsm/bli_trsm_blk_var2b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 5e57ecee8..43b46b752 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -37,40 +37,57 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a_pack; - obj_t b1, b1_pack; - obj_t c1, c1_pack; + obj_t a_pack_s; + obj_t b1_pack_s, c1_pack_s; + + obj_t b1, c1; + obj_t* a_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; - // Initialize all pack objects that are passed into packm_init(). 
- bli_obj_init_pack( &a_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c1_pack ); + // Initialize pack objects for A that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &a_pack_s ); + + // Initialize object for packing A. + bli_packm_init( a, &a_pack_s, + cntl_sub_packm_a( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + a_pack = thread_obroadcast( thread, &a_pack_s ); + + // Initialize pack objects for B and C that are passed into packm_init(). + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &b1_pack_s ); + bli_obj_init_pack( &c1_pack_s ); + } + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + c1_pack = thread_ibroadcast( thread, &c1_pack_s ); + + // Pack A (if instructed). + bli_packm_int( a, a_pack, + cntl_sub_packm_a( cntl ), + trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack, - cntl_sub_packm_a( cntl ) ); - - // Pack A (if instructed). - bli_packm_int( a, &a_pack, - cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + dim_t start, end; + //bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); // Partition along the n dimension. - for ( i = 0; i < n_trans; i += b_alg ) + for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, n_trans, b, @@ -83,38 +100,55 @@ void bli_trsm_blk_var2f( obj_t* a, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. 
- bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, &c1_pack, - cntl_sub_packm_c( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + bli_packm_init( &c1, c1_pack, + cntl_sub_packm_c( cntl ) ); + } + thread_ibarrier( thread ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). - bli_packm_int( &c1, &c1_pack, + bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a_pack, - &b1_pack, + a_pack, + b1_pack, &BLIS_ONE, - &c1_pack, - cntl_sub_trsm( cntl ) ); + c1_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - bli_unpackm_int( &c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); + // Currently must be done by 1 thread + if( thread_am_ichief( thread ) ) { + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ) ); + } + //Barrier to make sure unpacking is done before next iteration's packing of C + //Somehow, we'd like to make this a noop if packing isn't done. + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c1_pack ); + thread_obarrier( thread ); + if( thread_am_ochief( thread ) ) + bli_obj_release_pack( a_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( b1_pack ); + bli_obj_release_pack( c1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var2f.h b/frame/3/trsm/bli_trsm_blk_var2f.h index 44eb38460..ade7f0bf4 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.h +++ b/frame/3/trsm/bli_trsm_blk_var2f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var2f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 252f2eef7..3e586cdfc 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -37,38 +37,51 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). 
+ bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,45 +96,60 @@ void bli_trsm_blk_var3b( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). - bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. - bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3b.h b/frame/3/trsm/bli_trsm_blk_var3b.h index d8f6c8dc6..a1779dc67 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.h +++ b/frame/3/trsm/bli_trsm_blk_var3b.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index c59596090..2a3384a2b 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -37,38 +37,51 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { - obj_t a1, a1_pack; - obj_t b1, b1_pack; - obj_t c_pack; + obj_t c_pack_s; + obj_t a1_pack_s, b1_pack_s; + + obj_t 
a1, b1; + obj_t* a1_pack = NULL; + obj_t* b1_pack = NULL; + obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; - // Initialize all pack objects that are passed into packm_init(). - bli_obj_init_pack( &a1_pack ); - bli_obj_init_pack( &b1_pack ); - bli_obj_init_pack( &c_pack ); + // Initialize pack objects for C that are passed into packm_init(). + if( thread_am_ochief( thread ) ) { + bli_obj_init_pack( &c_pack_s ); + + // Initialize object for packing C. + bli_packm_init( c, &c_pack_s, + cntl_sub_packm_c( cntl ) ); + + // Scale C by beta (if instructed). + bli_scalm_int( &BLIS_ONE, + c, + cntl_sub_scalm( cntl ) ); + } + c_pack = thread_obroadcast( thread, &c_pack_s ); + + if( thread_am_ichief( thread ) ) { + bli_obj_init_pack( &a1_pack_s ); + bli_obj_init_pack( &b1_pack_s ); + } + a1_pack = thread_ibroadcast( thread, &a1_pack_s ); + b1_pack = thread_ibroadcast( thread, &b1_pack_s ); + + // Pack C (if instructed). + bli_packm_int( c, c_pack, + cntl_sub_packm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntl_sub_scalm( cntl ) ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack, - cntl_sub_packm_c( cntl ) ); - - // Pack C (if instructed). - bli_packm_int( c, &c_pack, - cntl_sub_packm_c( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); - // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { @@ -83,45 +96,60 @@ void bli_trsm_blk_var3f( obj_t* a, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. - bli_packm_init( &a1, &a1_pack, - cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, &b1_pack, - cntl_sub_packm_b( cntl ) ); + if( thread_am_ichief( thread ) ) { + bli_packm_init( &a1, a1_pack, + cntl_sub_packm_a( cntl ) ); + bli_packm_init( &b1, b1_pack, + cntl_sub_packm_b( cntl ) ); + } + thread_ibarrier( thread ); // Pack A1 (if instructed). 
- bli_packm_int( &a1, &a1_pack, + bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). - bli_packm_int( &b1, &b1_pack, + bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), - &BLIS_PACKM_SINGLE_THREADED ); + trsm_thread_sub_ipackm( thread ) ); + + // Packing must be done before computation + thread_ibarrier( thread ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, - &a1_pack, - &b1_pack, + a1_pack, + b1_pack, &BLIS_ONE, - &c_pack, - cntl_sub_trsm( cntl ) ); + c_pack, + cntl_sub_trsm( cntl ), + trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( &c_pack ); } + if ( i == 0 && thread_am_ichief( thread ) ) { + bli_obj_scalar_reset( a ); + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c_pack ); + } } + thread_obarrier( thread ); + // Unpack C (if C was packed). - bli_unpackm_int( &c_pack, c, - cntl_sub_unpackm_c( cntl ) ); + if( thread_am_ochief( thread ) ) { + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ) ); + bli_obj_release_pack( c_pack ); + } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
- bli_obj_release_pack( &a1_pack ); - bli_obj_release_pack( &b1_pack ); - bli_obj_release_pack( &c_pack ); + if( thread_am_ichief( thread ) ) { + bli_obj_release_pack( a1_pack ); + bli_obj_release_pack( b1_pack ); + } } diff --git a/frame/3/trsm/bli_trsm_blk_var3f.h b/frame/3/trsm/bli_trsm_blk_var3f.h index 8546b0ba5..013d70bc1 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.h +++ b/frame/3/trsm/bli_trsm_blk_var3f.h @@ -35,5 +35,6 @@ void bli_trsm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 1dd67ece5..e7cae7d51 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -125,12 +125,20 @@ void bli_trsm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - // Invoke the internal back-end. - bli_trsm_int( alpha, - &a_local, - &b_local, - alpha, - &c_local, - cntl ); + trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths(); + dim_t n_threads = thread_num_threads( infos[0] ); + + // Invoke the internal back-end. 
+ bli_level3_thread_decorator( n_threads, + (level3_int_t) bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + (void*) cntl, + (void**) infos ); + + bli_trsm_thrinfo_free_paths( infos ); } diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index db0fdf393..6644b3512 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -39,7 +39,8 @@ typedef void (*FUNCPTR_T)( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); static FUNCPTR_T vars[2][2][4][3] = { @@ -88,7 +89,8 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; @@ -109,7 +111,9 @@ void bli_trsm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - bli_scalm( beta, c ); + if( thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + thread_obarrier( thread ); return; } @@ -127,14 +131,17 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + if( thread_am_ochief( thread ) ) { + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + } } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -150,7 +157,8 @@ void bli_trsm_int( obj_t* alpha, // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) @@ -164,10 +172,13 @@ void bli_trsm_int( obj_t* alpha, // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &a_local ); + if( thread_am_ochief( thread ) ) + bli_obj_scalar_apply_scalar( alpha, &a_local ); } } + thread_obarrier( thread ); + // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); @@ -179,6 +190,7 @@ void bli_trsm_int( obj_t* alpha, f( &a_local, &b_local, &c_local, - cntl ); + cntl, + thread ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index 504f7928c..62a937b3c 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -37,4 +37,5 @@ void bli_trsm_int( obj_t* alpha, obj_t* b, obj_t* beta, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index bb0ed34db..0d31f656b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t 
cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. */ \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.h b/frame/3/trsm/bli_trsm_ll_ker_var2.h index 59e8e576b..d13ab6f23 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index d86a87ca0..6d0efe5e8 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.h b/frame/3/trsm/bli_trsm_lu_ker_var2.h index 50b18cf79..c26d0081a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.h +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 5d0288c40..3bc951bd5 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.h b/frame/3/trsm/bli_trsm_rl_ker_var2.h index a0605a7b7..8cc3c5fed 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.h +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 9bac5c946..6711ba423 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -47,7 +47,8 @@ typedef void (*FUNCPTR_T)( void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, - void* gemm_ukr + void* gemm_ukr, + trsm_thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); @@ -56,7 +57,8 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ) + trsm_t* cntl, + trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -139,7 +141,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, - gemm_ukr ); + gemm_ukr, + thread ); } @@ -157,7 +160,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ) \ { \ /* Cast the micro-kernels' addresses to their function pointer types. 
*/ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.h b/frame/3/trsm/bli_trsm_ru_ker_var2.h index ebb24b81f..c07b215af 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.h @@ -39,7 +39,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, - trsm_t* cntl ); + trsm_t* cntl, + trsm_thrinfo_t* thread ); // @@ -59,7 +60,8 @@ void PASTEMAC(ch,varname)( \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ - void* gemm_ukr \ + void* gemm_ukr, \ + trsm_thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c new file mode 100644 index 000000000..08c915b15 --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "assert.h" + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ /* Copies each argument into the field of *thread that has the same name. */ + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->icomm = icomm; + thread->icomm_id = icomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->opackm = opackm; + thread->ipackm = ipackm; + thread->sub_trsm = sub_trsm; +} + +void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread ) +{ /* Single-thread setup: both communicators are BLIS_SINGLE_COMM, n_way is 1, and sub_trsm points back at this node. */ + thread->ocomm = &BLIS_SINGLE_COMM; + thread->ocomm_id = 0; + thread->icomm = &BLIS_SINGLE_COMM; + thread->icomm_id = 0; + thread->n_way = 1; + thread->work_id = 0; + thread->opackm = &BLIS_PACKM_SINGLE_THREADED; + thread->ipackm = &BLIS_PACKM_SINGLE_THREADED; + thread->sub_trsm = thread; +} + +trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ) +{ /* Allocates one node with bli_malloc and fills it through bli_setup_trsm_thrinfo_node. */ + trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc( sizeof( trsm_thrinfo_t ) ); + bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id, + icomm, icomm_id, + n_way, work_id, + opackm, + ipackm, + sub_trsm ); + return thread; +} + +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads ) +{ /* NOTE(review): the body is empty, so this function releases nothing. */ 
+} + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) +{ /* Builds one chain of nodes (jc, kc, ic, jr, ir, linked through sub_trsm) for every combination of the five way-counts read from the environment. */ + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; + assert( global_num_threads != 0 ); + + dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; + dim_t kc_nt = ic_way * jr_way * ir_way; + dim_t ic_nt = jr_way * ir_way; + dim_t jr_nt = ir_way; + dim_t ir_nt = 1; + + + trsm_thrinfo_t** paths = (trsm_thrinfo_t**) malloc( global_num_threads * sizeof( trsm_thrinfo_t* ) ); /* NOTE(review): plain malloc, and the result is not checked before paths[] is written */ + + thread_comm_t* global_comm = bli_create_communicator( global_num_threads ); + for( int a = 0; a < jc_way; a++ ) + { + thread_comm_t* jc_comm = bli_create_communicator( jc_nt ); + for( int b = 0; b < kc_way; b++ ) + { + thread_comm_t* kc_comm = bli_create_communicator( kc_nt ); + for( int c = 0; c < ic_way; c++ ) + { + thread_comm_t* ic_comm = bli_create_communicator( ic_nt ); + for( int d = 0; d < jr_way; d++ ) + { + thread_comm_t* jr_comm = bli_create_communicator( jr_nt ); + for( int e = 0; e < ir_way; e++) + { + thread_comm_t* ir_comm = bli_create_communicator( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t kc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*kc_nt + kc_comm_id; + dim_t global_comm_id = a*jc_nt + jc_comm_id; + + trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id, + ir_comm, ir_comm_id, + ir_way, e, + NULL, NULL, NULL); + + trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + jr_way, d, + NULL, NULL, ir_info); + + packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* packa = 
bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); + + trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + ic_way, c, + packb, packa, jr_info); /* only this ic-level node is given packm info */ + + trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + kc_way, b, + NULL, NULL, ic_info); + + trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id, + jc_comm, jc_comm_id, + jc_way, a, + NULL, NULL, kc_info); + paths[global_comm_id] = jc_info; /* the jc-level node is the head of this thread's chain */ + } + } + } + } + } + return paths; +} diff --git a/frame/3/trsm/bli_trsm_threading.h b/frame/3/trsm/bli_trsm_threading.h new file mode 100644 index 000000000..30bc612bf --- /dev/null +++ b/frame/3/trsm/bli_trsm_threading.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +struct trsm_thrinfo_s //implements thrinfo_t +{ + thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level + dim_t ocomm_id; //Our thread id within that thread comm + thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level + dim_t icomm_id; //Our thread id within that thread comm + + dim_t n_way; //Number of distinct caucuses used to parallelize the loop + dim_t work_id; //What we're working on + + packm_thrinfo_t* opackm; + packm_thrinfo_t* ipackm; + struct trsm_thrinfo_s* sub_trsm; +}; +typedef struct trsm_thrinfo_s trsm_thrinfo_t; + +#define trsm_thread_sub_trsm( thread ) ( (thread)->sub_trsm ) +#define trsm_thread_sub_opackm( thread ) ( (thread)->opackm ) +#define trsm_thread_sub_ipackm( thread ) ( (thread)->ipackm ) + +#define trsm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trsm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trsm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) +#define trsm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way ) + +trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ); +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** ); + +void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread, + thread_comm_t* ocomm, dim_t ocomm_id, + 
thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ); + +trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id, + packm_thrinfo_t* opackm, + packm_thrinfo_t* ipackm, + trsm_thrinfo_t* sub_trsm ); + +void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread ); diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index c0ef641ea..0b9ec30bd 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -230,8 +230,51 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto *end = bli_min( *start + n_pt, size + all_start ); } -void bli_get_range_tri_weighted( void* thr, dim_t size, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) { + thrinfo_t* thread = (thrinfo_t*) thr; // Splits [all_start, all_end) among the n_way caucuses in pieces of about size*size/n_way "area" each; the result for this thread's work_id is written to *start and *end + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + dim_t size = all_end - all_start; + + *start = all_start; + *end = all_end; + + if( forward ) { + dim_t curr_caucus = n_way - 1; + dim_t len = 0; + dim_t num = size*size / n_way; // 2xArea per thread? + while(1){ + dim_t width = sqrt( len*len + num ) - len; // The width of the current caucus + width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); + if( curr_caucus == work_id ) { + if( work_id != 0 && *end - all_start > width ) // caucus 0 is served last and keeps all_start, so truncated widths cannot leave indices unassigned + *start = *end - width; + return; + } + else{ + *end = ( *end - all_start > width ) ? *end - width : all_start; // rounded-up widths must not carry *end below all_start + len += width; + curr_caucus--; + } + } + } + else{ + dim_t len = *end - *start; + dim_t num = len * len / n_way; + while(1){ + dim_t width = sqrt( ( *start - all_start ) * ( *start - all_start ) + num ) - ( *start - all_start ); // offset is measured from all_start, so a range that does not begin at 0 is split the same way + width = (width % block_factor == 0) ? 
width : width + block_factor - (width % block_factor); + if(!work_id) { + if( thread->work_id != n_way - 1 ) *end = bli_min( *start + width, *end ); // the last caucus keeps all_end so nothing is left unassigned + return; + } + else{ + *start = bli_min( *start + width, *end ); // never step past the end of the range + } + work_id--; + } + } } void bli_level3_thread_decorator( dim_t n_threads, @@ -257,3 +300,14 @@ void bli_level3_thread_decorator( dim_t n_threads, thread[omp_id] ); } } + +dim_t bli_read_nway_from_env( char* env ) +{ + dim_t number = 1; // value used when the variable is not set + char* str = getenv( env ); + if( str != NULL ) + { + number = strtol( str, NULL, 10 ); + } + return ( number < 1 ) ? 1 : number; // non-numeric, zero or negative settings fall back to one way +} diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index daaf2d6f4..f09da42c3 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -88,11 +88,13 @@ typedef struct thrinfo_s thrinfo_t; #define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end); thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); +dim_t bli_read_nway_from_env( char* env ); //void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); //thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); @@ -100,6 +102,7 @@ void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm #include "bli_gemm_threading.h" #include "bli_herk_threading.h" #include "bli_trmm_threading.h" +#include "bli_trsm_threading.h" typedef void (*level3_int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, void* cntl, void* thread ); void bli_level3_thread_decorator( dim_t num_threads, From 
73b3db594864be0f9be9a0eb29bf961fa9c95f29 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 26 Mar 2014 15:39:05 +0000 Subject: [PATCH 19/42] Some fixes for the bgq configuration --- config/bgq/bli_config.h | 15 +- config/bgq/bli_kernel.h | 30 +-- kernels/bgq/1/bli_axpyv_opt_var1.c | 4 +- kernels/bgq/1/bli_dotv_opt_var1.c | 4 +- kernels/bgq/3/bli_gemm_8x8.c | 315 ++++++++++++++++------------- kernels/bgq/3/bli_gemm_8x8.h | 3 +- 6 files changed, 199 insertions(+), 172 deletions(-) diff --git a/config/bgq/bli_config.h b/config/bgq/bli_config.h index 39a627a6a..234f39648 100644 --- a/config/bgq/bli_config.h +++ b/config/bgq/bli_config.h @@ -111,16 +111,16 @@ // Alignment size used when allocating memory dynamically from the operating // system (eg: posix_memalign()). To disable heap alignment and just use // malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 32 +#define BLIS_HEAP_ADDR_ALIGN_SIZE 64 // Alignment size used when sizing leading dimensions of dynamically // allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE 32 +#define BLIS_HEAP_STRIDE_ALIGN_SIZE 64 // Alignment size used when allocating entire blocks of contiguous memory // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32 // -- MIXED DATATYPE SUPPORT --------------------------------------------------- @@ -154,12 +154,13 @@ // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 // Fortran-77 name-mangling macros. 
-#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ +// Underscore is left out to work on BGQ systems +#define PASTEF770(name) name //## _ +#define PASTEF77(ch1,name) ch1 ## name //## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name //## _ diff --git a/config/bgq/bli_kernel.h b/config/bgq/bli_kernel.h index 02d9c89b5..cd2002545 100644 --- a/config/bgq/bli_kernel.h +++ b/config/bgq/bli_kernel.h @@ -54,8 +54,8 @@ // (b) NR (for triangular operations such as trmm and trsm). // -#define BLIS_DEFAULT_MC_S 256 -#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_MC_S 1024 +#define BLIS_DEFAULT_KC_S 2048 #define BLIS_DEFAULT_NC_S 8192 // 16 MPI RANKS CASE: @@ -64,17 +64,17 @@ // // 1 MPI RANK CASE: -#define BLIS_DEFAULT_MC_D 1008 -#define BLIS_DEFAULT_KC_D 2016 -#define BLIS_DEFAULT_NC_D 20480 +#define BLIS_DEFAULT_MC_D 1024 +#define BLIS_DEFAULT_KC_D 2048 +#define BLIS_DEFAULT_NC_D 10240 -#define BLIS_DEFAULT_MC_C 128 -#define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 4096 +#define BLIS_DEFAULT_MC_C 1024 +#define BLIS_DEFAULT_KC_C 2048 +#define BLIS_DEFAULT_NC_C 8192 -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 2048 +#define BLIS_DEFAULT_MC_Z 768 +#define BLIS_DEFAULT_KC_Z 1536 +#define BLIS_DEFAULT_NC_Z 10240 // -- Register blocksizes -- @@ -182,7 +182,7 @@ #include "bli_gemm_8x8.h" #define BLIS_DGEMM_UKERNEL bli_dgemm_8x8 -#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt +#define BLIS_ZGEMM_UKERNEL bli_zgemm_8x8 // -- trsm-related -- @@ -206,7 +206,7 @@ // -- axpyf -- -#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 +//#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 // -- dotxf -- @@ -221,13 +221,13 @@ // -- axpyv -- -#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 +//#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 // -- copyv -- // -- dotv -- -#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 +//#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 
// -- dotxv -- diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.c b/kernels/bgq/1/bli_axpyv_opt_var1.c index a6fb43daa..a4334fd58 100644 --- a/kernels/bgq/1/bli_axpyv_opt_var1.c +++ b/kernels/bgq/1/bli_axpyv_opt_var1.c @@ -42,6 +42,7 @@ void bli_daxpyv_opt_var1( double* restrict y_in, inc_t incy ) { + { double* restrict alpha = alpha_in; double* restrict x = x_in; double* restrict y = y_in; @@ -65,7 +66,7 @@ void bli_daxpyv_opt_var1( dim_t n_left = n % 4; vector4double xv, yv, zv; - vector4double alphav = vec_lds( 0 * sizeof(double), alpha ); + vector4double alphav = vec_lds( 0 * sizeof(double), &alpha[0] ); #pragma omp parallel for for ( dim_t i = 0; i < n_run; i++ ) @@ -80,3 +81,4 @@ void bli_daxpyv_opt_var1( y[4*n_run + i] += *alpha * x[4*n_run + i]; } } +} diff --git a/kernels/bgq/1/bli_dotv_opt_var1.c b/kernels/bgq/1/bli_dotv_opt_var1.c index 7fceb2ec3..7c54c5d02 100644 --- a/kernels/bgq/1/bli_dotv_opt_var1.c +++ b/kernels/bgq/1/bli_dotv_opt_var1.c @@ -42,7 +42,7 @@ void bli_ddotv_opt_var1( double* restrict y_in, inc_t incy, double* restrict rho_in ) -{ +{{ double* restrict x = x_in; double* restrict y = y_in; double* rho = rho_in; @@ -94,5 +94,5 @@ void bli_ddotv_opt_var1( } *rho = rhos; -} +}} diff --git a/kernels/bgq/3/bli_gemm_8x8.c b/kernels/bgq/3/bli_gemm_8x8.c index 2c1842f41..2b8905553 100644 --- a/kernels/bgq/3/bli_gemm_8x8.c +++ b/kernels/bgq/3/bli_gemm_8x8.c @@ -33,28 +33,9 @@ */ #include "blis.h" - -void bli_sgemm_8x8( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) -{ - /* Just call the reference implementation. 
*/ - BLIS_SGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); -} - - +#undef restrict +#include +#include /* @@ -62,7 +43,7 @@ void bli_sgemm_8x8( * Instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread: * "Achievable peak can either be: * 1) 12.8 GF 8 FMAs cycle * 1.6 GHz - * 2) 8.53 GF Takes into account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread + * 2) 8.53 GF Takes into account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread * 3) 7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread" * * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. @@ -74,14 +55,14 @@ void bli_sgemm_8x8( */ void bli_dgemm_8x8( - dim_t k, - double* alpha, - double* a, - double* b, - double* beta, - double* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) + dim_t k, + restrict double* alpha, + restrict double* a, + restrict double* b, + restrict double* beta, + restrict double* c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { //Registers for storing C. @@ -221,126 +202,170 @@ void bli_dgemm_8x8( UPDATE( AB, c, 4 ); } -void bli_dgemm_8x8_mt( - dim_t k, - double* alpha, - double* a, - double* b, - double* beta, - double* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - dim_t tid - ) +void printvec(vector4double v) { - bli_dgemm_8x8( k, - alpha, - a, - b, beta, - c, - rs_c, cs_c, - data ); -} - -void bli_cgemm_8x8( - dim_t k, - scomplex* alpha, - scomplex* a, - scomplex* b, - scomplex* beta, - scomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) -{ - /* Just call the reference implementation. 
*/ - BLIS_CGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); + double a = vec_extract(v, 0); + double b = vec_extract(v, 1); + double c = vec_extract(v, 2); + double d = vec_extract(v, 3); + printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n", a, b, c, d); } void bli_zgemm_8x8( - dim_t k, - dcomplex* alpha, - dcomplex* a, - dcomplex* b, - dcomplex* beta, - dcomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) + dim_t k, + dcomplex* alpha_z, + dcomplex* a_z, + dcomplex* b_z, + dcomplex* beta_z, + dcomplex* c_z, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { - /* Just call the reference implementation. */ - BLIS_ZGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); + double * alpha = (double*) alpha_z; + double * beta = (double*) beta_z; + double * a = (double*) a_z; + double * b = (double*) b_z; + double * c = (double*) c_z; + + //Registers for storing C. + //2 2x4 subblocks of C, c0, and c1 + //Each sub-block has 4 columns, 0, 1, 2, 3 + //Each column has 2 partial sum, a and b, and contains 2 complex numbers. 
+ vector4double c00a = vec_splats( 0.0 ); + vector4double c00b = vec_splats( 0.0 ); + vector4double c01a = vec_splats( 0.0 ); + vector4double c01b = vec_splats( 0.0 ); + vector4double c02a = vec_splats( 0.0 ); + vector4double c02b = vec_splats( 0.0 ); + vector4double c03a = vec_splats( 0.0 ); + vector4double c03b = vec_splats( 0.0 ); + + vector4double c10a = vec_splats( 0.0 ); + vector4double c10b = vec_splats( 0.0 ); + vector4double c11a = vec_splats( 0.0 ); + vector4double c11b = vec_splats( 0.0 ); + vector4double c12a = vec_splats( 0.0 ); + vector4double c12b = vec_splats( 0.0 ); + vector4double c13a = vec_splats( 0.0 ); + vector4double c13b = vec_splats( 0.0 ); + + + vector4double b0, b1, b2, b3; + vector4double a0, a1; + + double _Complex tmp = 0.0; + for( dim_t i = 0; i < k; i++ ) + { + + b0 = vec_ld2a( 0 * sizeof(double), &b[8*i] ); + b1 = vec_ld2a( 2 * sizeof(double), &b[8*i] ); + b2 = vec_ld2a( 4 * sizeof(double), &b[8*i] ); + b3 = vec_ld2a( 6 * sizeof(double), &b[8*i] ); + + a0 = vec_lda ( 0 * sizeof(double), &a[8*i] ); + a1 = vec_lda ( 4 * sizeof(double), &a[8*i] ); + + c00a = vec_xmadd ( b0, a0, c00a ); + c00b = vec_xxcpnmadd( a0, b0, c00b ); + c01a = vec_xmadd ( b1, a0, c01a ); + c01b = vec_xxcpnmadd( a0, b1, c01b ); + + c02a = vec_xmadd ( b2, a0, c02a ); + c02b = vec_xxcpnmadd( a0, b2, c02b ); + c03a = vec_xmadd ( b3, a0, c03a ); + c03b = vec_xxcpnmadd( a0, b3, c03b ); + + + c10a = vec_xmadd ( b0, a1, c10a ); + c10b = vec_xxcpnmadd( a1, b0, c10b ); + c11a = vec_xmadd ( b1, a1, c11a ); + c11b = vec_xxcpnmadd( a1, b1, c11b ); + + c12a = vec_xmadd ( b2, a1, c12a ); + c12b = vec_xxcpnmadd( a1, b2, c12b ); + c13a = vec_xmadd ( b3, a1, c13a ); + c13b = vec_xxcpnmadd( a1, b3, c13b ); + + } + + // Create patterns for permuting the "b" parts of each vector + vector4double pattern = vec_gpci( 01032 ); + vector4double zed = vec_splats( 0.0 ); + + vector4double AB; + vector4double C = vec_splats( 0.0 ); + vector4double C1 = vec_splats( 0.0 ); + vector4double C2 = 
vec_splats( 0.0 ); + + double alphar = *alpha; + double alphai = *(alpha+1); + double betar = *beta; + double betai = *(beta+1); + vector4double alphav = vec_splats( 0.0 ); + vector4double betav = vec_splats( 0.0 ); + alphav = vec_insert( alphar, alphav, 0); + alphav = vec_insert( alphai, alphav, 1); + alphav = vec_insert( alphar, alphav, 2); + alphav = vec_insert( alphai, alphav, 3); + betav = vec_insert( betar, betav, 0); + betav = vec_insert( betai, betav, 1); + betav = vec_insert( betar, betav, 2); + betav = vec_insert( betai, betav, 3); + double ct; + + + //Macro to update 2 elements of C in a column. + //REG1 is the register holding the first partial sum of those 2 elements + //REG2 is the register holding the second partial sum of those 2 elements + //ADDR is the address to write them to + //OFFSET is the number of rows from ADDR to write to +#define ZUPDATE( REG1, REG2, ADDR, OFFSET ) \ +{ \ + ct = *(ADDR + (OFFSET + 0) * rs_c); \ + C = vec_insert( ct, C, 0 ); \ + ct = *(ADDR + (OFFSET + 0) * rs_c + 1); \ + C = vec_insert( ct, C, 1 ); \ + ct = *(ADDR + (OFFSET + 2) * rs_c); \ + C = vec_insert( ct, C, 2 ); \ + ct = *(ADDR + (OFFSET + 2) * rs_c + 1); \ + C = vec_insert( ct, C, 3 ); \ + \ + AB = vec_sub(REG1, REG2 ); \ + \ + /* Scale by alpha */ \ + REG1 = vec_xmadd( alphav, AB, zed ); \ + REG2 = vec_xxcpnmadd( AB, alphav, zed ); \ + AB = vec_sub(REG1, REG2 ); \ + \ + \ + /* Scale by beta */ \ + REG1 = vec_xmadd( betav, C, zed ); \ + REG2 = vec_xxcpnmadd( C, betav, zed ); \ + C = vec_sub(REG1, REG2 ); \ + \ + /* Add AB to C */ \ + C = vec_add( AB, C ); \ + \ + ct = vec_extract( C, 0 ); \ + *(ADDR + (OFFSET + 0) * rs_c) = ct; \ + ct = vec_extract( C, 1 ); \ + *(ADDR + (OFFSET + 0) * rs_c + 1) = ct; \ + ct = vec_extract( C, 2 ); \ + *(ADDR + (OFFSET + 2) * rs_c) = ct; \ + ct = vec_extract( C, 3 ); \ + *(ADDR + (OFFSET + 2) * rs_c + 1) = ct; \ } -void bli_sgemm_8x8_mt( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t 
cs_c, - auxinfo_t* data, - dim_t t_id - ) -{ - /* Just call the reference implementation. */ - BLIS_SGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); -} - -void bli_cgemm_8x8_mt( - dim_t k, - scomplex* alpha, - scomplex* a, - scomplex* b, - scomplex* beta, - scomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - dim_t t_id - ) -{ - /* Just call the reference implementation. */ - BLIS_CGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); -} - -void bli_zgemm_8x8_mt( - dim_t k, - dcomplex* alpha, - dcomplex* a, - dcomplex* b, - dcomplex* beta, - dcomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - dim_t t_id - ) -{ - /* Just call the reference implementation. */ - BLIS_ZGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); + ZUPDATE( c00a, c00b, c, 0 ); + ZUPDATE( c10a, c10b, c, 4 ); + c += 2*cs_c; + ZUPDATE( c01a, c01b, c, 0 ); + ZUPDATE( c11a, c11b, c, 4 ); + c += 2*cs_c; + ZUPDATE( c02a, c02b, c, 0 ); + ZUPDATE( c12a, c12b, c, 4 ); + c += 2*cs_c; + ZUPDATE( c03a, c03b, c, 0 ); + ZUPDATE( c13a, c13b, c, 4 ); } diff --git a/kernels/bgq/3/bli_gemm_8x8.h b/kernels/bgq/3/bli_gemm_8x8.h index 75401eecb..b6ce51824 100644 --- a/kernels/bgq/3/bli_gemm_8x8.h +++ b/kernels/bgq/3/bli_gemm_8x8.h @@ -63,8 +63,7 @@ void PASTEMAC(ch,varname)( \ ctype* b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - dim_t tid \ + auxinfo_t* data \ ); INSERT_GENTPROT_BASIC( gemm_8x8_mt ) From a6fd48345424e097f71652be013aa897e098b41e Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 26 Mar 2014 17:19:46 +0000 Subject: [PATCH 20/42] Added test drivers for level 3 BLAS that run tests in parallel using MPI --- mpi_test/Makefile | 323 ++++++++++++++++++++++++++++++++++++++++++ mpi_test/test_gemm.c | 232 ++++++++++++++++++++++++++++++ mpi_test/test_hemm.c | 252 ++++++++++++++++++++++++++++++++ mpi_test/test_her2k.c | 209 +++++++++++++++++++++++++++ mpi_test/test_herk.c | 200 
++++++++++++++++++++++++++ mpi_test/test_trmm.c | 246 ++++++++++++++++++++++++++++++++ mpi_test/test_trsm.c | 282 ++++++++++++++++++++++++++++++++++++ 7 files changed, 1744 insertions(+) create mode 100644 mpi_test/Makefile create mode 100644 mpi_test/test_gemm.c create mode 100644 mpi_test/test_hemm.c create mode 100644 mpi_test/test_her2k.c create mode 100644 mpi_test/test_herk.c create mode 100644 mpi_test/test_trmm.c create mode 100644 mpi_test/test_trsm.c diff --git a/mpi_test/Makefile b/mpi_test/Makefile new file mode 100644 index 000000000..cb317471a --- /dev/null +++ b/mpi_test/Makefile @@ -0,0 +1,323 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all \ + blis essl \ + clean cleanx + + + +# +# --- Makefile initialization -------------------------------------------------- +# + +# Define the name of the configuration file. +CONFIG_MK_FILE := config.mk + +# Define the name of the file containing build and architecture-specific +# makefile definitions. +MAKE_DEFS_FILE := make_defs.mk + +# Locations of important files. +ROOT_PATH := .. +CONFIG_DIR := config + + + +# +# --- Include makefile configuration file -------------------------------------- +# + +# Construct the path to the makefile configuration file that was generated by +# the configure script. +CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE) + +# Include the configuration file. +-include $(CONFIG_MK_PATH) + +# Detect whether we actually got the configuration file. If we didn't, then +# it is likely that the user has not yet generated it (via configure). +ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes) +CONFIG_MK_PRESENT := yes +else +CONFIG_MK_PRESENT := no +endif + +# Now we have access to CONFIG_NAME, which tells us which sub-directory of the +# config directory to use as our configuration. 
+CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME) + + + +# +# --- Include makefile definitions file ---------------------------------------- +# + +# Construct the path to the makefile definitions file residing inside of +# the configuration sub-directory. +MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE) + +# Include the makefile definitions file. +-include $(MAKE_DEFS_MK_PATH) + +# Detect whether we actually got the make definitions file. If we didn't, then +# it is likely that the configuration is invalid (or incomplete). +ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes) +MAKE_DEFS_MK_PRESENT := yes +else +MAKE_DEFS_MK_PRESENT := no +endif + + + +# +# --- BLAS and LAPACK implementations ------------------------------------------ +# + +# BLIS library and header path. This is simply wherever it was installed. +BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib +BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis + +# BLIS library. +BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a + +# BLAS library path(s). This is where the BLAS libraries reside. +BLAS_LIB_PATH := $(HOME)/flame/lib +MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64/ +ESSL_LIB_PATH := /soft/libraries/essl/current/lib64 + +# OpenBLAS +OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a + +# ATLAS +ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \ + $(BLAS_LIB_PATH)/libatlas.a + +# MKL +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_sequential \ + -lmkl_core \ + -lmkl_intel_lp64 + +# ESSL +# Note: ESSL is named differently for SMP and/or BG +ESSL_LIB := $(ESSL_LIB_PATH)/libesslsmpbg.a \ + -L$(IBM_MAIN_DIR)/xlsmp/bg/3.1/bglib64/ \ + -L$(IBM_MAIN_DIR)/xlf/bg/14.1/bglib64/ \ + -lxlsmp -lxlf90_r -lxlfmath -lxl + +# Accelerate +MAC_LIB := -framework Accelerate + + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. 
+TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c)) + +# Override CFLAGS from make_defs.mk here, if desired. +#CFLAGS := -g -O2 -march=native + +# Add installed and local header paths to CFLAGS +CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) + +LINKER := $(CC) +#LDFLAGS := -L/home/00146/field/gnu/gcc-4.8.2/lib64 +#LDFLAGS += -lgfortran -lm -lpthread + + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +# Complete list of possible targets when defining 'all': +# +# blis openblas atlas mkl mac essl +# +all: blis essl + +blis: test_gemm_blis.x \ + test_hemm_blis.x \ + test_herk_blis.x \ + test_her2k_blis.x \ + test_trmm_blis.x \ + test_trsm_blis.x + +essl: test_gemm_essl.x \ + test_hemm_essl.x \ + test_herk_essl.x \ + test_her2k_essl.x \ + test_trmm_essl.x \ + test_trsm_essl.x + +openblas: test_gemv_openblas.x \ + test_ger_openblas.x \ + test_hemv_openblas.x \ + test_her_openblas.x \ + test_her2_openblas.x \ + test_trmv_openblas.x \ + test_trsv_openblas.x \ + \ + test_gemm_openblas.x \ + test_hemm_openblas.x \ + test_herk_openblas.x \ + test_her2k_openblas.x \ + test_trmm_openblas.x \ + test_trsm_openblas.x + +atlas: test_gemv_atlas.x \ + test_ger_atlas.x \ + test_hemv_atlas.x \ + test_her_atlas.x \ + test_her2_atlas.x \ + test_trmv_atlas.x \ + test_trsv_atlas.x \ + \ + test_gemm_atlas.x \ + test_hemm_atlas.x \ + test_herk_atlas.x \ + test_her2k_atlas.x \ + test_trmm_atlas.x \ + test_trsm_atlas.x + +mkl: test_gemv_mkl.x \ + test_ger_mkl.x \ + test_hemv_mkl.x \ + test_her_mkl.x \ + test_her2_mkl.x \ + test_trmv_mkl.x \ + test_trsv_mkl.x \ + \ + test_gemm_mkl.x \ + test_hemm_mkl.x \ + test_herk_mkl.x \ + test_her2k_mkl.x \ + test_trmm_mkl.x \ + test_trsm_mkl.x + +mac: test_gemv_mac.x \ + test_ger_mac.x \ + test_hemv_mac.x \ + test_her_mac.x \ + test_her2_mac.x \ + test_trmv_mac.x \ + test_trsv_mac.x \ + \ + test_gemm_mac.x \ + test_hemm_mac.x \ + test_herk_mac.x \ + 
test_her2k_mac.x \ + test_trmm_mac.x \ + test_trsm_mac.x + + + +# --Object file rules -- + +$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c + $(CC) $(CFLAGS) -c $< -o $@ + +test_%_openblas.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@ + +test_%_atlas.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@ + +test_%_mkl.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@ + +test_%_essl.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@ + +test_%_mac.o: test_%.c + $(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@ + +test_%_blis.o: test_%.c + $(CC) $(CFLAGS) -DBLIS -c $< -o $@ + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. + +test_%_openblas.x: test_%_openblas.o $(BLIS_LIB) + $(LINKER) $< $(OPENBLAS_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_atlas.x: test_%_atlas.o $(BLIS_LIB) + $(LINKER) $< $(ATLAS_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_mkl.x: test_%_mkl.o $(BLIS_LIB) + $(LINKER) $< $(MKL_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_essl.x: test_%_essl.o $(BLIS_LIB) + $(LINKER) $< $(ESSL_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_mac.x: test_%_mac.o $(BLIS_LIB) + $(LINKER) $< $(MAC_LIB) $(BLIS_LIB) $(LDFLAGS) -o $@ + +test_%_blis.x: test_%_blis.o $(BLIS_LIB) + $(LINKER) $< $(BLIS_LIB) $(LDFLAGS) -o $@ + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.o *.x + diff --git a/mpi_test/test_gemm.c b/mpi_test/test_gemm.c new file mode 100644 index 000000000..5864e667a --- /dev/null +++ b/mpi_test/test_gemm.c @@ -0,0 +1,232 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// transa transb m n k alpha a lda b ldb beta c ldc
+//void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, n_input, k_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	k_input = strtol( argv[3], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/n/k input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else n = ( dim_t ) n_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else k = ( dim_t ) k_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt_a, m, k, 0, 0, &a );
+		bli_obj_create( dt_b, k, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+
+		bli_setsc( (0.9/1.0), 0.2, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef BLIS
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_gemm( &alpha,
+			//bli_gemm4m( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#else
+		if ( bli_is_real( dt_a ) )
+		{
+			f77_char transa = 'N';
+			f77_char transb = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int kk = bli_obj_width_after_trans( a );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldb = bli_obj_col_stride( b );
+			f77_int ldc = bli_obj_col_stride( c );
+			double* alphap = bli_obj_buffer( alpha );
+			double* ap = bli_obj_buffer( a );
+			double* bp = bli_obj_buffer( b );
+			double* betap = bli_obj_buffer( beta );
+			double* cp = bli_obj_buffer( c );
+
+			dgemm_( &transa,
+			        &transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+		else
+		{
+			f77_char transa = 'N';
+			f77_char transb = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int kk = bli_obj_width_after_trans( a );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldb = bli_obj_col_stride( b );
+			f77_int ldc = bli_obj_col_stride( c );
+			dcomplex* alphap = bli_obj_buffer( alpha );
+			dcomplex* ap = bli_obj_buffer( a );
+			dcomplex* bp = bli_obj_buffer( b );
+			dcomplex* betap = bli_obj_buffer( beta );
+			dcomplex* cp = bli_obj_buffer( c );
+
+			zgemm_( &transa,
+			//zgemm3m_( &transa,
+			        &transb,
+			        &mm,
+			        &nn,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+		}
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_gemm_blis" );
+#else
+		printf( "data_gemm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
diff --git a/mpi_test/test_hemm.c b/mpi_test/test_hemm.c
new file mode 100644
index 000000000..4cab93ceb
--- /dev/null
+++ b/mpi_test/test_hemm.c
@@ -0,0 +1,252 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// side uploa m n alpha a lda b ldb beta c ldc
+//void dsymm_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, n_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+	side_t side;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+
+	side = BLIS_LEFT;
+	//side = BLIS_RIGHT;
+
+	uplo = BLIS_LOWER;
+	//uplo = BLIS_UPPER;
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/n input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else n = ( dim_t ) n_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt_a, m, m, 0, 0, &a );
+		else
+			bli_obj_create( dt_a, n, n, 0, 0, &a );
+		bli_obj_create( dt_b, m, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_HERMITIAN, a );
+		bli_obj_set_uplo( uplo, a );
+
+		// Randomize A, make it densely Hermitian, and zero the unstored
+		// triangle to ensure the implementation reads only from the stored
+		// region.
+		bli_randm( &a );
+		bli_mkherm( &a );
+		bli_mktrim( &a );
+/*
+		bli_obj_toggle_uplo( a );
+		bli_obj_inc_diag_off( 1, a );
+		bli_setm( &BLIS_ZERO, &a );
+		bli_obj_inc_diag_off( -1, a );
+		bli_obj_toggle_uplo( a );
+		bli_obj_set_diag( BLIS_NONUNIT_DIAG, a );
+		bli_scalm( &BLIS_TWO, &a );
+		bli_scalm( &BLIS_TWO, &a );
+*/
+
+
+		bli_setsc( (2.0/1.0), 1.0, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+/*
+			obj_t ar, ai;
+			bli_obj_alias_to( a, ar );
+			bli_obj_alias_to( a, ai );
+			bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+			bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+			bli_printm( "ar", &ar, "%4.1f", "" );
+			bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_hemm( side,
+			//bli_hemm4m( side,
+			          &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+#else
+
+			f77_char side = 'L';
+			f77_char uplo = 'L';
+			f77_int mm = bli_obj_length( c );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldb = bli_obj_col_stride( b );
+			f77_int ldc = bli_obj_col_stride( c );
+			double* alphap = bli_obj_buffer( alpha );
+			double* ap = bli_obj_buffer( a );
+			double* bp = bli_obj_buffer( b );
+			double* betap = bli_obj_buffer( beta );
+			double* cp = bli_obj_buffer( c );
+
+			dsymm_( &side,
+			        &uplo,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        bp, &ldb,
+			        betap,
+			        cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%9.5f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		if ( bli_is_left( side ) )
+			gflops = ( 2.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+		else
+			gflops = ( 2.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_hemm_blis" );
+#else
+		printf( "data_hemm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n, dtime_save, gflops );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
diff --git a/mpi_test/test_her2k.c b/mpi_test/test_her2k.c
new file mode 100644
index 000000000..f44ca4fb7
--- /dev/null
+++ b/mpi_test/test_her2k.c
@@ -0,0 +1,209 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// uploa transa m k alpha a lda b ldb beta c ldc
+//void dsyr2k_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, k_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	k_input = strtol( argv[3], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+
+	uplo = BLIS_LOWER;
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/k input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else k = ( dim_t ) k_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt_a, m, k, 0, 0, &a );
+		bli_obj_create( dt_b, m, k, 0, 0, &b );
+		bli_obj_create( dt_c, m, m, 0, 0, &c );
+		bli_obj_create( dt_c, m, m, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_HERMITIAN, c );
+		bli_obj_set_uplo( uplo, c );
+
+
+		bli_setsc( (2.0/1.0), 0.0, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_her2k( &alpha,
+			           &a,
+			           &b,
+			           &beta,
+			           &c );
+
+#else
+
+			f77_char uploa = 'L';
+			f77_char transa = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int kk = bli_obj_width_after_trans( a );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldb = bli_obj_col_stride( b );
+			f77_int ldc = bli_obj_col_stride( c );
+			double* alphap = bli_obj_buffer( alpha );
+			double* ap = bli_obj_buffer( a );
+			double* bp = bli_obj_buffer( b );
+			double* betap = bli_obj_buffer( beta );
+			double* cp = bli_obj_buffer( c );
+
+			dsyr2k_( &uploa,
+			         &transa,
+			         &mm,
+			         &kk,
+			         alphap,
+			         ap, &lda,
+			         bp, &ldb,
+			         betap,
+			         cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * m ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_her2k_blis" );
+#else
+		printf( "data_her2k_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k, dtime_save, gflops );
+
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
diff --git a/mpi_test/test_herk.c b/mpi_test/test_herk.c
new file mode 100644
index 000000000..ffe9ab85f
--- /dev/null
+++ b/mpi_test/test_herk.c
@@ -0,0 +1,200 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// uploa transa m k alpha a lda beta c ldc
+//void dsyrk_( char*, char*, int*, int*, double*, double*, int*, double*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, k;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, k_input;
+	num_t dt_a, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	k_input = strtol( argv[3], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+	dt_a = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+
+	uplo = BLIS_LOWER;
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/k input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
+		else k = ( dim_t ) k_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		bli_obj_create( dt_a, m, k, 0, 0, &a );
+		bli_obj_create( dt_c, m, m, 0, 0, &c );
+		bli_obj_create( dt_c, m, m, 0, 0, &c_save );
+
+		bli_randm( &a );
+		bli_randm( &c );
+
+		bli_obj_set_struc( BLIS_HERMITIAN, c );
+		bli_obj_set_uplo( uplo, c );
+
+
+		bli_setsc( (2.0/1.0), 0.0, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_herk( &alpha,
+			          &a,
+			          &beta,
+			          &c );
+
+#else
+
+			f77_char uploa = 'L';
+			f77_char transa = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int kk = bli_obj_width_after_trans( a );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldc = bli_obj_col_stride( c );
+			double* alphap = bli_obj_buffer( alpha );
+			double* ap = bli_obj_buffer( a );
+			double* betap = bli_obj_buffer( beta );
+			double* cp = bli_obj_buffer( c );
+
+			dsyrk_( &uploa,
+			        &transa,
+			        &mm,
+			        &kk,
+			        alphap,
+			        ap, &lda,
+			        betap,
+			        cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+		printf( "data_herk_blis" );
+#else
+		printf( "data_herk_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k, dtime_save, gflops );
+
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
diff --git a/mpi_test/test_trmm.c b/mpi_test/test_trmm.c
new file mode 100644
index 000000000..4d8112be8
--- /dev/null
+++ b/mpi_test/test_trmm.c
@@ -0,0 +1,246 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// side uplo trans diag m n alpha a lda b ldb
+//void dtrmm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, n_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+	side_t side;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX;
+#endif
+
+	side = BLIS_LEFT;
+	//side = BLIS_RIGHT;
+
+	uplo = BLIS_LOWER;
+	//uplo = BLIS_UPPER;
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/n input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else n = ( dim_t ) n_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt_a, m, m, 0, 0, &a );
+		else
+			bli_obj_create( dt_a, n, n, 0, 0, &a );
+		bli_obj_create( dt_b, m, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_obj_set_struc( BLIS_TRIANGULAR, a );
+		bli_obj_set_uplo( uplo, a );
+
+		bli_randm( &a );
+		bli_randm( &c );
+		bli_randm( &b );
+
+/*
+		bli_obj_toggle_uplo( a );
+		bli_obj_inc_diag_off( -1, a );
+		bli_setm( &BLIS_ZERO, &a );
+		bli_obj_inc_diag_off( 1, a );
+		bli_obj_toggle_uplo( a );
+		bli_obj_set_diag( BLIS_NONUNIT_DIAG, a );
+		bli_scalm( &BLIS_TWO, &a );
+		//bli_scalm( &BLIS_TWO, &a );
+*/
+
+
+
+		bli_setsc( (2.0/1.0), 0.0, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+
+/*
+			obj_t ar, ai;
+			bli_obj_alias_to( a, ar );
+			bli_obj_alias_to( a, ai );
+			bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+			bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+			bli_printm( "ar", &ar, "%4.1f", "" );
+			bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+			bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_trmm( side,
+			//bli_trmm4m( side,
+			          &alpha,
+			          &a,
+			          &c );
+
+#else
+
+			f77_char side = 'L';
+			f77_char uplo = 'L';
+			f77_char transa = 'N';
+			f77_char diag = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldc = bli_obj_col_stride( c );
+			double* alphap = bli_obj_buffer( alpha );
+			double* ap = bli_obj_buffer( a );
+			double* cp = bli_obj_buffer( c );
+
+			dtrmm_( &side,
+			        &uplo,
+			        &transa,
+			        &diag,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		if ( bli_is_left( side ) )
+			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+		else
+			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_trmm_blis" );
+#else
+		printf( "data_trmm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n, dtime_save, gflops );
+
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
diff --git a/mpi_test/test_trsm.c b/mpi_test/test_trsm.c
new file mode 100644
index 000000000..563bbdaaf
--- /dev/null
+++ b/mpi_test/test_trsm.c
@@ -0,0 +1,282 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+#include <mpi.h>
+
+// side uplo trans diag m n alpha a lda b ldb
+//void dtrsm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* );
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t a, b, c;
+	obj_t c_save;
+	obj_t alpha, beta;
+	dim_t m, n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int m_input, n_input;
+	num_t dt_a, dt_b, dt_c;
+	num_t dt_alpha, dt_beta;
+	int r, n_repeats;
+	side_t side;
+	uplo_t uplo;
+
+	double dtime;
+	double dtime_save;
+	double gflops;
+
+	bli_init();
+
+	n_repeats = 3;
+
+	if( argc < 7 )
+	{
+		printf("Usage:\n");
+		printf("test_foo.x m n k p_begin p_inc p_end:\n");
+		exit( 1 ); // argv[1..6] are read below, so stop here (MPI is not yet initialized).
+	}
+
+	int world_size, world_rank, provided;
+	MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided );
+	MPI_Comm_size( MPI_COMM_WORLD, &world_size );
+	MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
+
+	m_input = strtol( argv[1], NULL, 10 );
+	n_input = strtol( argv[2], NULL, 10 );
+	p_begin = strtol( argv[4], NULL, 10 );
+	p_inc = strtol( argv[5], NULL, 10 );
+	p_end = strtol( argv[6], NULL, 10 );
+
+#if 1
+	dt_a = BLIS_DOUBLE;
+	dt_b = BLIS_DOUBLE;
+	dt_c = BLIS_DOUBLE;
+	dt_alpha = BLIS_DOUBLE;
+	dt_beta = BLIS_DOUBLE;
+#else
+	dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_FLOAT;
+	//dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_SCOMPLEX;
+#endif
+
+	side = BLIS_LEFT;
+	//side = BLIS_RIGHT;
+
+	uplo = BLIS_LOWER;
+	//uplo = BLIS_UPPER;
+
+	for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size )
+	{
+		// Sizes are dealt round-robin to ranks; a negative m/n input is scaled by p.
+		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
+		else m = ( dim_t ) m_input;
+		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+		else n = ( dim_t ) n_input;
+
+
+		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt_beta, 1, 1, 0, 0, &beta );
+
+		if ( bli_is_left( side ) )
+			bli_obj_create( dt_a, m, m, 0, 0, &a );
+		else
+			bli_obj_create( dt_a, n, n, 0, 0, &a );
+		bli_obj_create( dt_b, m, n, 0, 0, &b );
+		bli_obj_create( dt_c, m, n, 0, 0, &c );
+		bli_obj_create( dt_c, m, n, 0, 0, &c_save );
+
+		bli_obj_set_struc( BLIS_TRIANGULAR, a );
+		bli_obj_set_uplo( uplo, a );
+		//bli_obj_set_diag( BLIS_UNIT_DIAG, a );
+
+		bli_randm( &a );
+		bli_randm( &c );
+		bli_randm( &b );
+
+/*
+		{
+			obj_t a2;
+
+			bli_obj_alias_to( a, a2 );
+			bli_obj_toggle_uplo( a2 );
+			bli_obj_inc_diag_off( 1, a2 );
+			bli_setm( &BLIS_ZERO, &a2 );
+			bli_obj_inc_diag_off( -2, a2 );
+			bli_obj_toggle_uplo( a2 );
+			bli_obj_set_diag( BLIS_NONUNIT_DIAG, a2 );
+			bli_scalm( &BLIS_TWO, &a2 );
+			//bli_scalm( &BLIS_TWO, &a );
+		}
+*/
+
+		bli_setsc( (2.0/1.0), 0.0, &alpha );
+		bli_setsc( (1.0/1.0), 0.0, &beta );
+
+
+		bli_copym( &c, &c_save );
+
+		dtime_save = 1.0e9;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+
+#ifdef PRINT
+/*
+			obj_t ar, ai;
+			bli_obj_alias_to( a, ar );
+			bli_obj_alias_to( a, ai );
+			bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2;
+			bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1;
+
+			bli_printm( "ar", &ar, "%4.1f", "" );
+			bli_printm( "ai", &ai, "%4.1f", "" );
+*/
+
+			bli_invertd( &a );
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_invertd( &a );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+			//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+			bli_trsm( side,
+			//bli_trsm4m( side,
+			//bli_trsm3m( side,
+			          &alpha,
+			          &a,
+			          &c );
+#else // NOTE(review): this path uses float/scomplex buffers with strsm_/ctrsm_, but the active "#if 1" branch above selects BLIS_DOUBLE -- confirm before benchmarking a BLAS build.
+
+		if ( bli_is_real( dt_a ) )
+		{
+			f77_char side = 'L';
+			f77_char uplo = 'L';
+			f77_char transa = 'N';
+			f77_char diag = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldc = bli_obj_col_stride( c );
+			float * alphap = bli_obj_buffer( alpha );
+			float * ap = bli_obj_buffer( a );
+			float * cp = bli_obj_buffer( c );
+
+			strsm_( &side,
+			        &uplo,
+			        &transa,
+			        &diag,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+		else // if ( bli_is_complex( dt_a ) )
+		{
+			f77_char side = 'L';
+			f77_char uplo = 'L';
+			f77_char transa = 'N';
+			f77_char diag = 'N';
+			f77_int mm = bli_obj_length( c );
+			f77_int nn = bli_obj_width( c );
+			f77_int lda = bli_obj_col_stride( a );
+			f77_int ldc = bli_obj_col_stride( c );
+			scomplex* alphap = bli_obj_buffer( alpha );
+			scomplex* ap = bli_obj_buffer( a );
+			scomplex* cp = bli_obj_buffer( c );
+
+			ctrsm_( &side,
+			//ztrsm_( &side,
+			        &uplo,
+			        &transa,
+			        &diag,
+			        &mm,
+			        &nn,
+			        alphap,
+			        ap, &lda,
+			        cp, &ldc );
+		}
+
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		if ( bli_is_left( side ) )
+			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
+		else
+			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt_a ) ) gflops *= 4.0;
+
+#ifdef BLIS
+		printf( "data_trsm_blis" );
+#else
+		printf( "data_trsm_%s", BLAS );
+#endif
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n",
+		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )n, dtime_save, gflops );
+
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	bli_finalize();
+	MPI_Finalize(); // pairs with MPI_Init_thread() above
+	return 0;
+}
+
From 9f78ec6e7e95fcad89a167b27cad7e2d74b6d122 Mon Sep 17 00:00:00 2001
From: Tyler Smith
Date: Thu, 27 Mar 2014 14:18:46 -0500
Subject: [PATCH 21/42] Some fixes for the internal
functions, which were inappropriately having only the thread chief do some things. --- frame/3/gemm/bli_gemm_int.c | 11 ++++------- frame/3/herk/bli_herk_int.c | 14 ++++---------- frame/3/trmm/bli_trmm_int.c | 14 ++++---------- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 9c0adee84..5f59c43c8 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -110,28 +110,25 @@ void bli_gemm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - if( thread_am_ochief( thread ) ) { + //if( thread_am_ochief( thread ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - } + // } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } - thread_obarrier( thread ); // Extract the variant number and implementation type. n = cntl_var_num( cntl ); diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/bli_herk_int.c index 0bc5c6a9b..f604a55c6 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/bli_herk_int.c @@ -109,34 +109,28 @@ void bli_herk_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. 
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - if( thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - } + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; - thread_obarrier( thread ); - // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/bli_trmm_int.c index 0148a670b..038348dd7 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/bli_trmm_int.c @@ -131,26 +131,22 @@ void bli_trmm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - if( thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - } + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - if( thread_am_ochief( thread ) ) - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -170,8 +166,6 @@ void bli_trmm_int( obj_t* alpha, else uplo = 1; } - thread_obarrier( thread ); - // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); From 459dde4acc09e49380da58fb7b246db488884ad9 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 27 Mar 2014 17:06:45 -0500 Subject: [PATCH 22/42] Made barrier after packing implicit. This also fixed a bug where barriers in the blocked variants were inserted after the inner packing routines, but not the outer packing routines. This allowed, for instance, computation to begin before the block of B had finished being packed. --- frame/1m/packm/bli_packm_int.c | 3 +++ frame/3/gemm/bli_gemm_blk_var1f.c | 3 --- frame/3/gemm/bli_gemm_blk_var2f.c | 3 --- frame/3/gemm/bli_gemm_blk_var3f.c | 3 --- frame/3/herk/bli_herk_blk_var1f.c | 3 --- frame/3/herk/bli_herk_blk_var3f.c | 3 --- frame/3/trmm/bli_trmm_blk_var1f.c | 3 --- frame/3/trmm/bli_trmm_blk_var2b.c | 3 --- frame/3/trmm/bli_trmm_blk_var2f.c | 3 --- frame/3/trmm/bli_trmm_blk_var3b.c | 3 --- frame/3/trmm/bli_trmm_blk_var3f.c | 3 --- frame/3/trsm/bli_trsm_blk_var2b.c | 3 --- frame/3/trsm/bli_trsm_blk_var2f.c | 3 --- frame/3/trsm/bli_trsm_blk_var3b.c | 3 --- frame/3/trsm/bli_trsm_blk_var3f.c | 3 --- 15 files changed, 3 insertions(+), 42 deletions(-) diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 2a7dc991c..a3d89b679 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -123,5 +123,8 @@ void bli_packm_int( obj_t* a, f( a, p, thread ); + + // Barrier so that packing is done before computation + thread_obarrier( thread ); } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c 
b/frame/3/gemm/bli_gemm_blk_var1f.c index c3e5db6c0..2ba71c536 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -119,9 +119,6 @@ void bli_gemm_blk_var1f( obj_t* a, bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); - - // Packing must be done before computation. - thread_ibarrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index 82aad8b3d..cab440c55 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -119,9 +119,6 @@ void bli_gemm_blk_var2f( obj_t* a, cntl_sub_packm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a_pack, diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index a6a70181b..c5af97f94 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -115,9 +115,6 @@ void bli_gemm_blk_var3f( obj_t* a, bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), gemm_thread_sub_ipackm( thread ) ); - - // Packing must be done before computation. - thread_ibarrier( thread ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 899aa194c..7d4fa4375 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -116,9 +116,6 @@ void bli_herk_blk_var1f( obj_t* a, cntl_sub_packm_c( cntl ), herk_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform herk subproblem. 
bli_herk_int( &BLIS_ONE, a1_pack, diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 78e3cd30e..4c86adc8e 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -120,9 +120,6 @@ void bli_herk_blk_var3f( obj_t* a, // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - // Packing must be done before computation - thread_ibarrier( thread ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index ac1973366..e71ec01b6 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -128,9 +128,6 @@ void bli_trmm_blk_var1f( obj_t* a, cntl_sub_packm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); - // Packing must be finished before computation - thread_ibarrier( thread ); - // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a1_pack, diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 86787a80a..dae4f04e7 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -116,9 +116,6 @@ void bli_trmm_blk_var2b( obj_t* a, bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); - - // Packing must be finished before computation - thread_ibarrier( thread ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 39033fcf3..d0959fc6e 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -116,9 +116,6 @@ void bli_trmm_blk_var2f( obj_t* a, bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); - - // Packing must be finished before computation - thread_ibarrier( thread ); // Perform trmm subproblem. 
bli_trmm_int( &BLIS_ONE, diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 40e9e21d6..f2ccd38a6 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -113,9 +113,6 @@ void bli_trmm_blk_var3b( obj_t* a, cntl_sub_packm_b( cntl ), trmm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a1_pack, diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 80293e42f..c361d6b23 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -113,9 +113,6 @@ void bli_trmm_blk_var3f( obj_t* a, cntl_sub_packm_b( cntl ), trmm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a1_pack, diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 435c9dec3..eadda1c37 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -117,9 +117,6 @@ void bli_trsm_blk_var2b( obj_t* a, cntl_sub_packm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a_pack, diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 43b46b752..e81875a4d 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -118,9 +118,6 @@ void bli_trsm_blk_var2f( obj_t* a, cntl_sub_packm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trsm subproblem. 
bli_trsm_int( &BLIS_ONE, a_pack, diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 3e586cdfc..680353d68 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -114,9 +114,6 @@ void bli_trsm_blk_var3b( obj_t* a, cntl_sub_packm_b( cntl ), trsm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 2a3384a2b..80d15b477 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -114,9 +114,6 @@ void bli_trsm_blk_var3f( obj_t* a, cntl_sub_packm_b( cntl ), trsm_thread_sub_ipackm( thread ) ); - // Packing must be done before computation - thread_ibarrier( thread ); - // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, From 1584ae1c83c3a8c1af76acb46404747507650f19 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 28 Mar 2014 15:15:48 -0500 Subject: [PATCH 23/42] Fixed race condition involving scalar reset --- frame/3/herk/bli_herk_blk_var3f.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 4c86adc8e..931c6e9eb 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -112,14 +112,6 @@ void bli_herk_blk_var3f( obj_t* a, bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), herk_thread_sub_ipackm( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. 
- if ( i != 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, @@ -130,6 +122,14 @@ void bli_herk_blk_var3f( obj_t* a, cntl_sub_herk( cntl ), herk_thread_sub_herk( thread ) ); + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c_pack is a local obj_t, we can simply overwrite the + // internal beta scalar with BLIS_ONE once it has been used in the + // first iteration. + if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); + } thread_obarrier( thread ); From 2041c264517b6c590fd4f7e8253e6911b622d1c3 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Thu, 3 Apr 2014 10:30:03 -0500 Subject: [PATCH 24/42] Added barriers needed prior to doing scalar reset for rank-k updates. --- frame/3/gemm/bli_gemm_blk_var3f.c | 1 + frame/3/herk/bli_herk_blk_var3f.c | 3 ++- frame/3/trsm/bli_trsm_blk_var3b.c | 1 + frame/3/trsm/bli_trsm_blk_var3f.c | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index c5af97f94..3f723d43c 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -131,6 +131,7 @@ void bli_gemm_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. 
+ if ( i == 0 ) thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 931c6e9eb..96e9da471 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -112,7 +112,7 @@ void bli_herk_blk_var3f( obj_t* a, bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), herk_thread_sub_ipackm( thread ) ); - + // Perform herk subproblem. bli_herk_int( &BLIS_ONE, a1_pack, @@ -128,6 +128,7 @@ void bli_herk_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. + if ( i == 0 ) thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 680353d68..b43f9f0f8 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -126,6 +126,7 @@ void bli_trsm_blk_var3b( obj_t* a, // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. + if ( i == 0 ) thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 80d15b477..84ad3ed16 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -126,6 +126,7 @@ void bli_trsm_blk_var3f( obj_t* a, // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. 
+ if ( i == 0 ) thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); From 4e3eb39aca4df0b9fdc003d468f368a2f2ba597d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 4 Apr 2014 14:50:03 +0000 Subject: [PATCH 25/42] Some fixes to the bgq config MR and NR for double complex were wrong Default fusing factor for double precision was wrong as well --- config/bgq/bli_kernel.h | 15 +++++---------- kernels/bgq/1/bli_axpyv_opt_var1.c | 1 - 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/config/bgq/bli_kernel.h b/config/bgq/bli_kernel.h index cd2002545..80065ec06 100644 --- a/config/bgq/bli_kernel.h +++ b/config/bgq/bli_kernel.h @@ -58,11 +58,6 @@ #define BLIS_DEFAULT_KC_S 2048 #define BLIS_DEFAULT_NC_S 8192 -// 16 MPI RANKS CASE: -//#define BLIS_DEFAULT_MC_D 256//1024 -//#define BLIS_DEFAULT_KC_D 512//2048 -// - // 1 MPI RANK CASE: #define BLIS_DEFAULT_MC_D 1024 #define BLIS_DEFAULT_KC_D 2048 @@ -87,7 +82,7 @@ #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 -#define BLIS_DEFAULT_MR_Z 8 +#define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 // NOTE: If the micro-kernel, which is typically unrolled to a factor @@ -153,7 +148,7 @@ // -- Default fusing factors for level-1f operations -- #define BLIS_L1F_FUSE_FAC_S 8 -#define BLIS_L1F_FUSE_FAC_D 4 +#define BLIS_L1F_FUSE_FAC_D 8 #define BLIS_L1F_FUSE_FAC_C 4 #define BLIS_L1F_FUSE_FAC_Z 2 @@ -206,7 +201,7 @@ // -- axpyf -- -//#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 +#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 // -- dotxf -- @@ -221,13 +216,13 @@ // -- axpyv -- -//#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 +#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 // -- copyv -- // -- dotv -- -//#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 +#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 // -- dotxv -- diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.c b/kernels/bgq/1/bli_axpyv_opt_var1.c index a4334fd58..4c1c61af6 100644 --- 
a/kernels/bgq/1/bli_axpyv_opt_var1.c +++ b/kernels/bgq/1/bli_axpyv_opt_var1.c @@ -57,7 +57,6 @@ void bli_daxpyv_opt_var1( } // Call the reference implementation if needed. if ( use_ref == TRUE ) { - printf("Defaulting to reference!"); BLIS_DAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy ); return; } From ec58a7923cccac08632670caadf3cf6ff5dce766 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 10:22:48 -0500 Subject: [PATCH 26/42] Freeing thread info paths. Also made herk IC and JC loops do weighted partitioning --- frame/1m/packm/bli_packm_threading.c | 9 ++++++++- frame/1m/packm/bli_packm_threading.h | 1 + frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_threading.c | 23 ++++++++++++++++++++++- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/her2k/bli_her2k_front.c | 2 +- frame/3/herk/bli_herk_blk_var1f.c | 2 +- frame/3/herk/bli_herk_blk_var2f.c | 2 +- frame/3/herk/bli_herk_front.c | 2 +- frame/3/herk/bli_herk_threading.c | 21 ++++++++++++++++++++- frame/3/symm/bli_symm_front.c | 2 +- frame/3/syr2k/bli_syr2k_front.c | 2 +- frame/3/syrk/bli_syrk_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_threading.c | 22 +++++++++++++++++++++- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trsm/bli_trsm_front.c | 2 +- frame/3/trsm/bli_trsm_threading.c | 21 ++++++++++++++++++++- frame/base/bli_threading.c | 6 ++++++ frame/base/bli_threading.h | 2 ++ 20 files changed, 112 insertions(+), 17 deletions(-) diff --git a/frame/1m/packm/bli_packm_threading.c b/frame/1m/packm/bli_packm_threading.c index 0fa6b0bf2..098475c5e 100644 --- a/frame/1m/packm/bli_packm_threading.c +++ b/frame/1m/packm/bli_packm_threading.c @@ -34,13 +34,20 @@ #include "blis.h" +void bli_packm_thrinfo_free( packm_thrinfo_t* thread ) +{ + //Assume that the ocomm and the icomm are freed by something else and don't need to be freed. 
+ bli_free(thread); +} + packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ) { return (packm_thrinfo_t*) bli_create_thread_info( ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); } -void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, +void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ) { bli_setup_thread_info( (thrinfo_t*) thread, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id ); diff --git a/frame/1m/packm/bli_packm_threading.h b/frame/1m/packm/bli_packm_threading.h index 0d6fce2e4..7b4dc0f22 100644 --- a/frame/1m/packm/bli_packm_threading.h +++ b/frame/1m/packm/bli_packm_threading.h @@ -46,6 +46,7 @@ typedef struct packm_thrinfo_s packm_thrinfo_t; #define packm_thread_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +void bli_packm_thrinfo_free( packm_thrinfo_t* thread ); packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 2211625a5..01b8eaab7 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -88,6 +88,6 @@ void bli_gemm_front( obj_t* alpha, (void*) cntl, (void**) infos ); - bli_gemm_thrinfo_free_paths( infos ); + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 047b083cf..2b2277b33 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -84,8 +84,29 @@ 
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads ) +void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread) { + if( thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); +// if( thread_am_ichief( thread ) ) +// bli_cleanup_communicator( thread->icomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( opackm ); + bli_packm_thrinfo_free( ipackm ); + bli_gemm_thrinfo_free( sub_gemm ); + bli_free( thread ); + + return; +} +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_gemm_thrinfo_free( threads[i] ); + bli_free( threads ); } gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c3a708211..7848e1117 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -94,6 +94,6 @@ void bli_hemm_front( side_t side, (void*) cntl, (void**) infos ); - bli_gemm_thrinfo_free_paths( infos ); + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index b8329cf5b..01afc70dc 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -135,7 +135,7 @@ void bli_her2k_front( obj_t* alpha, (void*) cntl, (void**) infos ); - bli_herk_thrinfo_free_paths( infos ); + bli_herk_thrinfo_free_paths( infos, n_threads ); #endif } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 7d4fa4375..6ef80bad2 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. 
m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range( thread, 0, m_trans, 8, &start, &end ); + bli_get_range_weighted( thread, 0, m_trans, 8, bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 5fcb56001..95215e906 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -91,7 +91,7 @@ void bli_herk_blk_var2f( obj_t* a, // Needs to be replaced with a weighted range because triangle //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, 8, bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 6139478ea..6fb092460 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -91,6 +91,6 @@ void bli_herk_front( obj_t* alpha, (void*) cntl, (void**) infos ); - bli_herk_thrinfo_free_paths( infos ); + bli_herk_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 2b291a924..091b74ff6 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -84,8 +84,27 @@ herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads ) +void bli_herk_thrinfo_free( herk_thrinfo_t* thread) { + if( thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( opackm ); + bli_packm_thrinfo_free( ipackm ); + bli_herk_thrinfo_free( sub_herk ); + bli_free( thread ); + + return; +} +void 
bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_herk_thrinfo_free( threads[i] ); + bli_free( threads ); } herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index ed0c44664..796ad5196 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -93,6 +93,6 @@ void bli_symm_front( side_t side, (void*) cntl, (void**) infos ); - bli_gemm_thrinfo_free_paths( infos ); + bli_gemm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index f1ce3e279..eceaf1913 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -117,7 +117,7 @@ void bli_syr2k_front( obj_t* alpha, (void*) cntl, (void**) infos ); - bli_herk_thrinfo_free_paths( infos ); + bli_herk_thrinfo_free_paths( infos, n_threads ); #endif } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index c5ac22797..977a91cd8 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -87,6 +87,6 @@ void bli_syrk_front( obj_t* alpha, (void*) cntl, (void**) infos ); - bli_herk_thrinfo_free_paths( infos ); + bli_herk_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 644f27d4b..d8caba7dc 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -139,6 +139,6 @@ void bli_trmm_front( side_t side, (void*) cntl, (void**) infos ); - bli_trmm_thrinfo_free_paths( infos ); + bli_trmm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 3a6a7c0b4..144f4a64b 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -84,8 +84,28 @@ trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -void 
bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads ) +void bli_trmm_thrinfo_free( trmm_thrinfo_t* thread) { + if( thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( opackm ); + bli_packm_thrinfo_free( ipackm ); + bli_trmm_thrinfo_free( sub_trmm ); + bli_free( thread ); + + return; +} + +void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_trmm_thrinfo_free( threads[i] ); + bli_free( threads ); } trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 080b9a399..6f8757faa 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -141,6 +141,6 @@ void bli_trmm3_front( side_t side, (void*) cntl, (void**) infos ); - bli_trmm_thrinfo_free_paths( infos ); + bli_trmm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index e7cae7d51..2c42c24f9 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -139,6 +139,6 @@ void bli_trsm_front( side_t side, (void*) cntl, (void**) infos ); - bli_trsm_thrinfo_free_paths( infos ); + bli_trsm_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c index 08c915b15..139e090de 100644 --- a/frame/3/trsm/bli_trsm_threading.c +++ b/frame/3/trsm/bli_trsm_threading.c @@ -84,8 +84,27 @@ trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ return thread; } -void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads ) +void bli_trsm_thrinfo_free( trsm_thrinfo_t* thread) { + if( thread == NULL ) return; + + // Free Communicators + if( thread_am_ochief( thread ) ) + bli_free_communicator( thread->ocomm ); + + // Free Sub Thrinfos + bli_packm_thrinfo_free( opackm ); + 
bli_packm_thrinfo_free( ipackm ); + bli_trsm_thrinfo_free( sub_trsm ); + bli_free( thread ); + + return; +} +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num ) +{ + for( int i = 0; i < num; i++) + bli_trsm_thrinfo_free( threads[i] ); + bli_free( threads ); } trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 0b9ec30bd..df903af4f 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -39,6 +39,12 @@ void bli_cleanup_communicator( thread_comm_t* communicator ) if( communicator == NULL ) return; bli_destroy_lock( &communicator->barrier_lock ); } +void bli_free_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; + bli_cleanup_communicator( communicator ); + bli_free( communicator ); +} void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) { diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index f09da42c3..ca790192f 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -49,7 +49,9 @@ struct thread_comm_s typedef struct thread_comm_s thread_comm_t; void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads ); +void bli_cleanup_communicator( thread_comm_t* communicator ); thread_comm_t* bli_create_communicator( dim_t n_threads ); +void bli_free_communicator( thread_comm_t* communicator ); void* bli_broadcast_structure( thread_comm_t* communicator, dim_t inside_id, void* to_send ); From ab9c7880335c281432d5809fe0dec46753d22569 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 11:38:11 -0500 Subject: [PATCH 27/42] Added faster tree barriers necessary for performance for Xeon Phi Fixed up some stuff in the thread info free functions Disabled threading for TRSM so that it actually works when threading environment variables are set --- frame/3/gemm/bli_gemm_threading.c | 8 +- frame/3/gemm/bli_gemm_threading.h | 2 +- 
frame/3/herk/bli_herk_threading.c | 6 +- frame/3/herk/bli_herk_threading.h | 2 +- frame/3/trmm/bli_trmm_threading.c | 6 +- frame/3/trmm/bli_trmm_threading.h | 2 +- frame/3/trsm/bli_trsm_threading.c | 13 +- frame/3/trsm/bli_trsm_threading.h | 2 +- frame/base/bli_threading.c | 248 ++++++++++++++---------------- frame/base/bli_threading.h | 69 ++++++--- 10 files changed, 185 insertions(+), 173 deletions(-) diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 2b2277b33..7a81e8e04 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -91,13 +91,11 @@ void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread) // Free Communicators if( thread_am_ochief( thread ) ) bli_free_communicator( thread->ocomm ); -// if( thread_am_ichief( thread ) ) -// bli_cleanup_communicator( thread->icomm ); // Free Sub Thrinfos - bli_packm_thrinfo_free( opackm ); - bli_packm_thrinfo_free( ipackm ); - bli_gemm_thrinfo_free( sub_gemm ); + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_gemm_thrinfo_free( thread->sub_gemm ); bli_free( thread ); return; diff --git a/frame/3/gemm/bli_gemm_threading.h b/frame/3/gemm/bli_gemm_threading.h index 24bf6d734..b789ba09b 100644 --- a/frame/3/gemm/bli_gemm_threading.h +++ b/frame/3/gemm/bli_gemm_threading.h @@ -58,7 +58,7 @@ typedef struct gemm_thrinfo_s gemm_thrinfo_t; #define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ); -void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** ); +void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t**, dim_t n_threads ); void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 091b74ff6..7c5a6e141 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -93,9 +93,9 @@ void 
bli_herk_thrinfo_free( herk_thrinfo_t* thread) bli_free_communicator( thread->ocomm ); // Free Sub Thrinfos - bli_packm_thrinfo_free( opackm ); - bli_packm_thrinfo_free( ipackm ); - bli_herk_thrinfo_free( sub_herk ); + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_herk_thrinfo_free( thread->sub_herk ); bli_free( thread ); return; diff --git a/frame/3/herk/bli_herk_threading.h b/frame/3/herk/bli_herk_threading.h index d156547a8..33a04ff8b 100644 --- a/frame/3/herk/bli_herk_threading.h +++ b/frame/3/herk/bli_herk_threading.h @@ -59,7 +59,7 @@ typedef struct herk_thrinfo_s herk_thrinfo_t; herk_thrinfo_t** bli_create_herk_thrinfo_paths( ); -void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths ); +void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads ); void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 144f4a64b..ff9e6723c 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -93,9 +93,9 @@ void bli_trmm_thrinfo_free( trmm_thrinfo_t* thread) bli_free_communicator( thread->ocomm ); // Free Sub Thrinfos - bli_packm_thrinfo_free( opackm ); - bli_packm_thrinfo_free( ipackm ); - bli_trmm_thrinfo_free( sub_trmm ); + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_trmm_thrinfo_free( thread->sub_trmm ); bli_free( thread ); return; diff --git a/frame/3/trmm/bli_trmm_threading.h b/frame/3/trmm/bli_trmm_threading.h index 376608261..3b4ebd743 100644 --- a/frame/3/trmm/bli_trmm_threading.h +++ b/frame/3/trmm/bli_trmm_threading.h @@ -59,7 +59,7 @@ typedef struct trmm_thrinfo_s trmm_thrinfo_t; #define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ); -void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** ); +void 
bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads ); void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c index 139e090de..8d62a737b 100644 --- a/frame/3/trsm/bli_trsm_threading.c +++ b/frame/3/trsm/bli_trsm_threading.c @@ -93,9 +93,9 @@ void bli_trsm_thrinfo_free( trsm_thrinfo_t* thread) bli_free_communicator( thread->ocomm ); // Free Sub Thrinfos - bli_packm_thrinfo_free( opackm ); - bli_packm_thrinfo_free( ipackm ); - bli_trsm_thrinfo_free( sub_trsm ); + bli_packm_thrinfo_free( thread->opackm ); + bli_packm_thrinfo_free( thread->ipackm ); + bli_trsm_thrinfo_free( thread->sub_trsm ); bli_free( thread ); return; @@ -109,11 +109,18 @@ void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num ) trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ) { + /* dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + */ + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/trsm/bli_trsm_threading.h b/frame/3/trsm/bli_trsm_threading.h index 30bc612bf..ad841331e 100644 --- a/frame/3/trsm/bli_trsm_threading.h +++ b/frame/3/trsm/bli_trsm_threading.h @@ -59,7 +59,7 @@ typedef struct trsm_thrinfo_s trsm_thrinfo_t; #define trsm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( ); -void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** ); +void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** info, dim_t n_threads ); void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* 
thread, thread_comm_t* ocomm, dim_t ocomm_id, diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index df903af4f..1efd53480 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -34,27 +34,140 @@ #include "blis.h" +#ifdef BLIS_TREE_BARRIER +barrier_t* bli_free_barrier_tree( barrier_t* barrier ) +{ + if( barrier == NULL ) + return; + barrier->count--; + bli_free_barrier_tree( barrier->dad ); + if( barrier->count == 1 ) + bli_free( barrier ); + return; +} +barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index) +{ + barrier_t* me = (barrier_t*) malloc(sizeof(barrier_t)); + + me->dad = NULL; + me->signal = 0; + + // Base Case + if( num_threads <= arity ) { + //Now must be registered as a leaf + for(int i = 0; i < num_threads; i++) + { + leaves[leaf_index + i] = me; + } + me->count = num_threads; + me->arity = num_threads; + } + else { + // Otherwise this node has children + int threads_per_kid = num_threads / arity; + int defecit = num_threads - threads_per_kid * arity; + + for(int i = 0; i < arity; i++){ + int threads_this_kid = threads_per_kid; + if(i < defecit) threads_this_kid++; + + barrier_t* kid = bli_create_tree_barrier(threads_this_kid, arity, leaves, leaf_index); + kid->dad = me; + + leaf_index += threads_this_kid; + } + me->count = arity; + me->arity = arity; + } + + return me; +} + void bli_cleanup_communicator( thread_comm_t* communicator ) { if( communicator == NULL ) return; - bli_destroy_lock( &communicator->barrier_lock ); + for( dim_t i = 0; i < communicator->n_threads; i++) + { + bli_free_barrier_tree( communicator->barriers[i] ); + } + bli_free( communicator->barriers ); } -void bli_free_communicator( thread_comm_t* communicator ) +void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) { if( communicator == NULL ) return; - bli_cleanup_communicator( communicator ); - bli_free( communicator ); + communicator->sent_object = NULL; + 
communicator->n_threads = n_threads; + communicator->barriers = ( barrier_t** ) bli_malloc( sizeof( barrier_t* ) * n_threads ); + bli_create_tree_barrier( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 ); } +void tree_barrier( barrier_t* barack ) +{ + int my_signal = barack->signal; + int my_count; + + _Pragma("omp atomic capture") + my_count = barack->count--; + + if( my_count == 1 ) { + if( barack->dad != NULL ) { + tree_barrier( barack->dad ); + } + barack->count = barack->arity; + barack->signal = !barack->signal; + } + else { + volatile int* listener = &barack->signal; + while( *listener == my_signal ) {} + } +} + +void bli_barrier( thread_comm_t* comm, dim_t t_id ) +{ + tree_barrier( comm->barriers[t_id] ); +} + +#else +void bli_cleanup_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; +} void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) { if( communicator == NULL ) return; communicator->sent_object = NULL; communicator->n_threads = n_threads; communicator->barrier_sense = 0; - bli_init_lock( &communicator->barrier_lock ); communicator->barrier_threads_arrived = 0; } +//barrier routine taken from art of multicore programming or something +void bli_barrier( thread_comm_t* communicator, dim_t t_id ) +{ + if(communicator == NULL || communicator->n_threads == 1) + return; + bool_t my_sense = communicator->barrier_sense; + dim_t my_threads_arrived; + + _Pragma("omp atomic capture") + my_threads_arrived = ++(communicator->barrier_threads_arrived); + + if( my_threads_arrived == communicator->n_threads ) { + communicator->barrier_threads_arrived = 0; + communicator->barrier_sense = !communicator->barrier_sense; + } + else { + volatile bool_t* listener = &communicator->barrier_sense; + while( *listener == my_sense ) {} + } +} +#endif + +void bli_free_communicator( thread_comm_t* communicator ) +{ + if( communicator == NULL ) return; + bli_cleanup_communicator( communicator ); + bli_free( 
communicator ); +} thread_comm_t* bli_create_communicator( dim_t n_threads ) { @@ -76,85 +189,6 @@ void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_s return object; } -void bli_init_lock( lock_t* lock ) -{ - omp_init_lock( lock ); -} -void bli_destroy_lock( lock_t* lock ) -{ - omp_destroy_lock( lock ); -} -void bli_set_lock( lock_t* lock ) -{ - omp_set_lock( lock ); -} -void bli_unset_lock( lock_t* lock ) -{ - omp_unset_lock( lock ); -} - -//barrier routine taken from art of multicore programming or something -void bli_barrier( thread_comm_t* communicator, dim_t t_id ) -{ - if(communicator == NULL || communicator->n_threads == 1) - return; - bool_t my_sense = communicator->barrier_sense; - dim_t my_threads_arrived; - - _Pragma("omp atomic capture") - my_threads_arrived = ++(communicator->barrier_threads_arrived); - -/* - bli_set_lock(&communicator->barrier_lock); - my_threads_arrived = communicator->barrier_threads_arrived + 1; - communicator->barrier_threads_arrived = my_threads_arrived; - bli_unset_lock(&communicator->barrier_lock); -*/ - - if( my_threads_arrived == communicator->n_threads ) { - - bli_set_lock(&communicator->barrier_lock); - communicator->barrier_threads_arrived = 0; - communicator->barrier_sense = !communicator->barrier_sense; - bli_unset_lock(&communicator->barrier_lock); - } - else { - volatile bool_t* listener = &communicator->barrier_sense; - while( *listener == my_sense ) {} - } -} -/* -//Recursively create thread communicators -void create_comms( dim_t* caucuses_at_level, dim_t n_levels, dim_t cur_level, - thread_comm_tree_t* parent, thread_comm_tree_t* leaves, dim_t global_id ) -{ - //Create a communicator - dim_t n_threads = 1; - for( dim_t i = cur_level; i < n_levels; i++) - n_threads *= caucuses_at_level[i]; - - - thread_comm_t* comm = bli_create_communicator( n_threads ); - thread_comm_tree_t* info; - if( cur_level == n_levels ) - { - leaves[global_id].parent = parent; - leaves[global_id].comm = comm; - 
return; - } - else - { - info = (thread_comm_tree_t*)bli_malloc(sizeof(thread_comm_tree_t)); - info->comm = comm; - info->parent = parent; - } - - //Now create child communicators - dim_t caucuses = caucuses_at_level[cur_level]; - for( dim_t i = 0; i < caucuses; i++) - create_comms( caucuses_at_level, n_levels, cur_level+1, info, leaves, global_id * caucuses + i); -} -*/ thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ) { @@ -176,52 +210,6 @@ void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id thr->work_id = work_id; } -/* -thrinfo_t* bli_create_thread_info( dim_t* caucuses_at_level, dim_t n_levels ) -{ - //Calculate total number of threads - dim_t n_threads = 1; - for( dim_t i = 0; i < n_levels; i++) - n_threads *= caucuses_at_level[i]; - - //Create communicators - thread_comm_tree_t* comm_leaves = (thread_comm_tree_t*)bli_malloc( sizeof(thread_comm_tree_t) * n_threads); - create_comms( caucuses_at_level, n_levels, 0, NULL, comm_leaves, 0 ); - thrinfo_t* info_paths = (thrinfo_t*)bli_malloc( sizeof(thrinfo_t) * n_threads ); - - //Now create paths upwards - for( dim_t i = 0; i < n_threads; i++ ) - { - thread_comm_tree_t* comm_node = &comm_leaves[i]; - - //Setup thread info for the bottom-most level - thrinfo_t* bot = &BLIS_SINGLE_THREADED; //bli_create_thrinfo_t( comm_node->comm, 0, NULL, 1, 0 ); - - //Now build thread infos upwards - comm_node = comm_node->parent; - thrinfo_t* cur; - thrinfo_t* prev = bot; - for( dim_t j = 0; j < n_levels; j++ ) - { - if( j == n_levels - 1 ) - cur = &info_paths[i]; - else - cur = (thrinfo_t*)bli_malloc(sizeof(thrinfo_t)); - - dim_t caucus_size = prev->ocomm->n_threads; - dim_t ocomm_id = i % comm_node->comm->n_threads; - dim_t caucus_id = ocomm_id / caucus_size; - - bli_setup_thrinfo_t(cur, comm_node->comm, ocomm_id, - prev, caucuses_at_level[n_levels - j - 1], caucus_id ); - - prev = cur; - comm_node = 
comm_node->parent; - } - } - return info_paths; -} -*/ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) { thrinfo_t* thread = (thrinfo_t*) thr; diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index ca790192f..0ca6fdf4f 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -34,32 +34,44 @@ #ifndef BLIS_THREADING_H #define BLIS_THREADING_H +#define BLIS_TREE_BARRIER +#define BLIS_TREE_BARRIER_ARITY 4 -typedef omp_lock_t lock_t; +#ifdef BLIS_TREE_BARRIER + struct barrier_s + { + int arity; + int count; + struct barrier_s* dad; + int signal; + }; + typedef struct barrier_s barrier_t; -struct thread_comm_s -{ - void* sent_object; - dim_t n_threads; + struct thread_comm_s + { + void* sent_object; + dim_t n_threads; + barrier_t** barriers; + }; +#else + struct thread_comm_s + { + void* sent_object; + dim_t n_threads; - bool_t barrier_sense; - lock_t barrier_lock; - dim_t barrier_threads_arrived; -}; + bool_t barrier_sense; + dim_t barrier_threads_arrived; + }; +#endif typedef struct thread_comm_s thread_comm_t; +// Thread Communicator Interface Definitions void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads ); void bli_cleanup_communicator( thread_comm_t* communicator ); thread_comm_t* bli_create_communicator( dim_t n_threads ); void bli_free_communicator( thread_comm_t* communicator ); - void* bli_broadcast_structure( thread_comm_t* communicator, dim_t inside_id, void* to_send ); - void bli_barrier( thread_comm_t* communicator, dim_t thread_id ); -void bli_set_lock( lock_t* lock ); -void bli_unset_lock( lock_t* lock ); -void bli_init_lock( lock_t* lock ); -void bli_destroy_lock( lock_t* lock ); struct thrinfo_s { @@ -73,14 +85,15 @@ struct thrinfo_s }; typedef struct thrinfo_s thrinfo_t; -#define thread_ocomm( thread ) thread->ocomm +// Thread Info Interface Definitions +#define thread_ocomm( thread ) (thread->ocomm) #define thread_icomm( thread ) 
(thread->icomm) -#define thread_id( thread ) thread->ocomm_id -#define thread_num_threads( thread ) thread->ocomm->n_threads +#define thread_id( thread ) (thread->ocomm_id) +#define thread_num_threads( thread ) (thread->ocomm->n_threads) -#define thread_work_id( thread ) thread->work_id -#define thread_n_way( thread ) thread->n_way +#define thread_work_id( thread ) (thread->work_id) +#define thread_n_way( thread ) (thread->n_way) #define thread_am_ochief( thread ) (thread->ocomm_id == 0) #define thread_am_ichief( thread ) (thread->icomm_id == 0) @@ -91,15 +104,21 @@ typedef struct thrinfo_s thrinfo_t; void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end); - -thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, - dim_t n_way, dim_t work_id ); -void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, - dim_t n_way, dim_t work_id ); +thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); +void bli_setup_thread_info( thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, + thread_comm_t* icomm, dim_t icomm_id, + dim_t n_way, dim_t work_id ); dim_t bli_read_nway_from_env( char* env ); + //void bli_setup_single_threaded_info( thrinfo_t* thr, thread_comm_t* comm ); //thrinfo_t* bli_create_thread_info( dim_t* n_threads_each_level, dim_t n_levels ); + +//TODO: These nneed to be included after the thread info and thread comm definitions +// But this doesn't seem like the best place to put these includes. +// Note that the bli_packm_threading.h must be included before the others! 
#include "bli_packm_threading.h" #include "bli_gemm_threading.h" #include "bli_herk_threading.h" From 575fb9b0b08f3bdb56ccde056da619d1585617c1 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 12:13:29 -0500 Subject: [PATCH 28/42] Changed default blocking factor to default double precision MR and NR --- frame/3/gemm/bli_gemm_blk_var1f.c | 2 +- frame/3/gemm/bli_gemm_blk_var2f.c | 2 +- frame/3/herk/bli_herk_blk_var1f.c | 2 +- frame/3/herk/bli_herk_blk_var2f.c | 2 +- frame/3/trmm/bli_trmm_blk_var1f.c | 3 +-- frame/3/trmm/bli_trmm_blk_var2b.c | 2 +- frame/3/trmm/bli_trmm_blk_var2f.c | 2 +- frame/3/trsm/bli_trsm_blk_var1b.c | 2 +- frame/3/trsm/bli_trsm_blk_var1f.c | 2 +- frame/3/trsm/bli_trsm_blk_var2b.c | 2 +- frame/3/trsm/bli_trsm_blk_var2f.c | 2 +- 11 files changed, 11 insertions(+), 12 deletions(-) diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 2ba71c536..29c4670af 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, 0, m_trans, 8, &start, &end ); + bli_get_range( thread, 0, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index cab440c55..dd8a073d4 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -82,7 +82,7 @@ void bli_gemm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, 8, &start, &end ); + bli_get_range( thread, 0, n_trans, BLIS_DEFAULT_NC_D, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 6ef80bad2..880a06110 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range_weighted( thread, 0, m_trans, 8, bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, m_trans, BLIS_DEFAULT_MC_D, bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 95215e906..45b4d423a 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -91,7 +91,7 @@ void bli_herk_blk_var2f( obj_t* a, // Needs to be replaced with a weighted range because triangle //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, 8, bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index e71ec01b6..4d4e87ade 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -94,8 +94,7 @@ void bli_trmm_blk_var1f( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, 8, &start, &end ); - + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) { diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index dae4f04e7..18c580fa9 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2b( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 0, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index d0959fc6e..68cd11033 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 1, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 66b3e9fc7..fd34d3ec4 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -82,7 +82,7 @@ void bli_trsm_blk_var1b( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, 8, &start, &end ); + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); // Partition along the remaining portion of the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index 0525db3be..53868058d 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -81,7 +81,7 @@ void bli_trsm_blk_var1f( obj_t* a, offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); dim_t start, end; - bli_get_range( thread, offA, m_trans, 8, &start, &end ); + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index eadda1c37..d8f29513a 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -83,7 +83,7 @@ void bli_trsm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, 8, 0, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 0, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index e81875a4d..038e035f9 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -84,7 +84,7 @@ void bli_trsm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, 8, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 1, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) From 5ec93bd9a76096312d51c326ccde1e9bd0a436ab Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 15:09:10 -0500 Subject: [PATCH 29/42] Bunch of minor fixes Removed barrier after unpackm in all level3 blocked variants Now there is an implicit barrier inside unpackm that only occurs if C is packed (which is usually not the case) Moved the enabling of the tree barriers into bli_config.h Fed the default MR and NR for double precision into bli_get_range instead of the number 8 --- config/mic/bli_config.h | 3 +++ frame/1m/packm/bli_packm_unb_var1.c | 33 ++++++++++++++++------------- frame/1m/packm/bli_packm_unb_var1.h | 3 ++- frame/1m/unpackm/bli_unpackm_int.c | 12 +++++++---- frame/1m/unpackm/bli_unpackm_int.h | 3 ++- frame/2/ger/bli_ger_blk_var1.c | 3 ++- frame/2/ger/bli_ger_blk_var2.c | 3 ++- frame/2/her/bli_her_blk_var1.c | 3 ++- frame/2/her/bli_her_blk_var2.c | 3 ++- frame/2/her2/bli_her2_blk_var1.c | 3 ++- frame/2/her2/bli_her2_blk_var2.c | 3 ++- frame/2/her2/bli_her2_blk_var3.c | 3 ++- frame/2/her2/bli_her2_blk_var4.c | 3 ++- frame/3/gemm/bli_gemm_blk_var1f.c | 10 +++------ frame/3/gemm/bli_gemm_blk_var2f.c | 10 +++------ frame/3/gemm/bli_gemm_blk_var3f.c | 10 ++++----- frame/3/herk/bli_herk_blk_var1f.c | 9 +++----- frame/3/herk/bli_herk_blk_var2f.c | 8 +++---- frame/3/herk/bli_herk_blk_var3f.c | 11 +++++----- frame/3/trmm/bli_trmm_blk_var1f.c | 11 +++------- frame/3/trmm/bli_trmm_blk_var2b.c | 13 ++++-------- frame/3/trmm/bli_trmm_blk_var2f.c | 13 ++++-------- frame/3/trmm/bli_trmm_blk_var3b.c | 11 +++++----- frame/3/trmm/bli_trmm_blk_var3f.c | 11 +++++----- frame/3/trsm/bli_trsm_blk_var2b.c | 11 +++------- frame/3/trsm/bli_trsm_blk_var2f.c | 11 +++------- frame/3/trsm/bli_trsm_blk_var3b.c | 11 +++++----- frame/3/trsm/bli_trsm_blk_var3f.c | 11 +++++----- frame/base/bli_threading.c | 2 +- frame/base/bli_threading.h | 3 --- 30 files changed, 114 insertions(+), 130 deletions(-) diff --git a/config/mic/bli_config.h 
b/config/mic/bli_config.h index 637e71f74..688ed75ec 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -36,6 +36,9 @@ #define BLIS_CONFIG_H +#define BLIS_TREE_BARRIER +#define BLIS_TREE_BARRIER_ARITY 4 + // -- OPERATING SYSTEM --------------------------------------------------------- diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index c7d85a78a..b344b93d6 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -56,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); void bli_packm_unb_var1( obj_t* c, - obj_t* p ) + obj_t* p, + packm_thrinfo_t* thread ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -98,20 +99,22 @@ void bli_packm_unb_var1( obj_t* c, // function pointer. f = ftypes[dt_cp]; - // Invoke the function. - f( strucc, - diagoffc, - diagc, - uploc, - transc, - densify, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p ); + if( thread_am_ochief( thread ) ) { + // Invoke the function. + f( strucc, + diagoffc, + diagc, + uploc, + transc, + densify, + m_p, + n_p, + m_max_p, + n_max_p, + buf_kappa, + buf_c, rs_c, cs_c, + buf_p, rs_p, cs_p ); + } } diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h index 1f4c451bf..25e95994e 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_unb_var1.h @@ -33,7 +33,8 @@ */ void bli_packm_unb_var1( obj_t* c, - obj_t* p ); + obj_t* p, + packm_thrinfo_t* thread ); #undef GENTPROT diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index f94bbb423..70a520403 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -49,7 +49,8 @@ static FUNCPTR_T vars[2][3] = void bli_unpackm_int( obj_t* p, obj_t* a, - unpackm_t* cntl ) + unpackm_t* cntl, + packm_thrinfo_t* thread ) { // The unpackm operation consists of an optional post-process: castm. 
// (This post-process is analogous to the castm pre-process in packm.) @@ -122,9 +123,12 @@ void bli_unpackm_int( obj_t* p, f = vars[n][i]; // Invoke the variant. - f( p, - &c, - cntl ); + if( thread_am_ochief( thread ) ) { + f( p, + &c, + cntl ); + } + thread_obarrier( thread ); // Now, if necessary, we cast the contents of c to matrix a. If casting // was not necessary, then we are done because the call to the unpackm diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 11960817c..89b8489f6 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -34,7 +34,8 @@ void bli_unpackm_int( obj_t* p, obj_t* a, - unpackm_t* cntl ); + unpackm_t* cntl, + packm_thrinfo_t* thread ); /* void bli_unpackm_init_cast( obj_t* p, diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/bli_ger_blk_var1.c index e22c69fd9..77b6ace11 100644 --- a/frame/2/ger/bli_ger_blk_var1.c +++ b/frame/2/ger/bli_ger_blk_var1.c @@ -91,7 +91,8 @@ void bli_ger_blk_var1( obj_t* alpha, // Copy/unpack A1 (if A1 was packed). bli_unpackm_int( &a1_pack, &a1, - cntl_sub_unpackm_a( cntl ) ); + cntl_sub_unpackm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/bli_ger_blk_var2.c index a1a5eeb45..6405497aa 100644 --- a/frame/2/ger/bli_ger_blk_var2.c +++ b/frame/2/ger/bli_ger_blk_var2.c @@ -91,7 +91,8 @@ void bli_ger_blk_var2( obj_t* alpha, // Copy/unpack A1 (if A1 was packed). 
bli_unpackm_int( &a1_pack, &a1, - cntl_sub_unpackm_a( cntl ) ); + cntl_sub_unpackm_a( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/bli_her_blk_var1.c index 7121ff0b1..45fc9c1d4 100644 --- a/frame/2/her/bli_her_blk_var1.c +++ b/frame/2/her/bli_her_blk_var1.c @@ -113,7 +113,8 @@ void bli_her_blk_var1( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/bli_her_blk_var2.c index b9bf2154c..a856269b0 100644 --- a/frame/2/her/bli_her_blk_var2.c +++ b/frame/2/her/bli_her_blk_var2.c @@ -113,7 +113,8 @@ void bli_her_blk_var2( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/bli_her2_blk_var1.c index 645b9de79..af15b674f 100644 --- a/frame/2/her2/bli_her2_blk_var1.c +++ b/frame/2/her2/bli_her2_blk_var1.c @@ -137,7 +137,8 @@ void bli_her2_blk_var1( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/bli_her2_blk_var2.c index d6876de3e..d57da2bff 100644 --- a/frame/2/her2/bli_her2_blk_var2.c +++ b/frame/2/her2/bli_her2_blk_var2.c @@ -140,7 +140,8 @@ void bli_her2_blk_var2( conj_t conjh, // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/bli_her2_blk_var3.c index 7e84b5830..8270f8dff 100644 --- a/frame/2/her2/bli_her2_blk_var3.c +++ b/frame/2/her2/bli_her2_blk_var3.c @@ -140,7 +140,8 @@ void bli_her2_blk_var3( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/bli_her2_blk_var4.c index 4760606f9..77b750230 100644 --- a/frame/2/her2/bli_her2_blk_var4.c +++ b/frame/2/her2/bli_her2_blk_var4.c @@ -137,7 +137,8 @@ void bli_her2_blk_var4( conj_t conjh, // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, - cntl_sub_unpackm_c11( cntl ) ); + cntl_sub_unpackm_c11( cntl ), + &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index 29c4670af..a1b93eb1a 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -131,13 +131,9 @@ void bli_gemm_blk_var1f( obj_t* a, // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. 
- thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index dd8a073d4..61ea352b9 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -130,13 +130,9 @@ void bli_gemm_blk_var2f( obj_t* a, // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. - thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 3f723d43c..f1114daaf 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -139,14 +139,14 @@ void bli_gemm_blk_var3f( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). - if( thread_am_ochief( thread ) ){ - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + gemm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
+ if( thread_am_ochief( thread ) ) + bli_obj_release_pack( c_pack ); if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 880a06110..fa184e2b0 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -126,12 +126,9 @@ void bli_herk_blk_var1f( obj_t* a, herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 45b4d423a..8496b0852 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -142,11 +142,9 @@ void bli_herk_blk_var2f( obj_t* a, herk_thread_sub_herk( thread ) ); // Unpack C1 (if C1 was packed). - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1S_pack, &c1S, - cntl_sub_unpackm_c( cntl ) ); - } - thread_ibarrier( thread ); + bli_unpackm_int( c1S_pack, &c1S, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 96e9da471..943109156 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -136,14 +136,15 @@ void bli_herk_blk_var3f( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). 
- if( thread_am_ochief( thread ) ) { - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + herk_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. + if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } if( thread_am_ichief( thread ) ) { bli_obj_release_pack( a1_pack ); bli_obj_release_pack( ah1_pack ); diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 4d4e87ade..fff6cc7fc 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -137,14 +137,9 @@ void bli_trmm_blk_var1f( obj_t* a, trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. - thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 18c580fa9..25f07d031 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2b( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 0, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) @@ -127,14 +127,9 @@ void bli_trmm_blk_var2b( obj_t* a, trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. - thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 68cd11033..0077ea9cb 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) @@ -127,14 +127,9 @@ void bli_trmm_blk_var2f( obj_t* a, trmm_thread_sub_trmm( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. 
- thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index f2ccd38a6..6a1191936 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -126,14 +126,15 @@ void bli_trmm_blk_var3b( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). - if( thread_am_ochief( thread ) ){ - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. + if( thread_am_ochief( thread ) ){ + bli_obj_release_pack( c_pack ); + } if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index c361d6b23..67a4aa880 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -126,14 +126,15 @@ void bli_trmm_blk_var3f( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). - if( thread_am_ochief( thread ) ){ - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trmm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
+ if( thread_am_ochief( thread ) ){ + bli_obj_release_pack( c_pack ); + } if( thread_am_ichief( thread ) ){ bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index d8f29513a..c4ad6e7bd 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -127,14 +127,9 @@ void bli_trsm_blk_var2b( obj_t* a, trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. - thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 038e035f9..54e165029 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -128,14 +128,9 @@ void bli_trsm_blk_var2f( obj_t* a, trsm_thread_sub_trsm( thread ) ); // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - if( thread_am_ichief( thread ) ) { - bli_unpackm_int( c1_pack, &c1, - cntl_sub_unpackm_c( cntl ) ); - } - //Barrier to make sure unpacking is done before next iteration's packing of C - //Somehow, we'd like to make this a noop if packing isn't done. 
- thread_ibarrier( thread ); + bli_unpackm_int( c1_pack, &c1, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index b43f9f0f8..dd6b2c0c7 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -137,14 +137,15 @@ void bli_trsm_blk_var3b( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). - if( thread_am_ochief( thread ) ) { - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. + if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } if( thread_am_ichief( thread ) ) { bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 84ad3ed16..466fd4461 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -137,14 +137,15 @@ void bli_trsm_blk_var3f( obj_t* a, thread_obarrier( thread ); // Unpack C (if C was packed). - if( thread_am_ochief( thread ) ) { - bli_unpackm_int( c_pack, c, - cntl_sub_unpackm_c( cntl ) ); - bli_obj_release_pack( c_pack ); - } + bli_unpackm_int( c_pack, c, + cntl_sub_unpackm_c( cntl ), + trsm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. 
+ if( thread_am_ochief( thread ) ) { + bli_obj_release_pack( c_pack ); + } if( thread_am_ichief( thread ) ) { bli_obj_release_pack( a1_pack ); bli_obj_release_pack( b1_pack ); diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 1efd53480..4f77d75f5 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -35,7 +35,7 @@ #include "blis.h" #ifdef BLIS_TREE_BARRIER -barrier_t* bli_free_barrier_tree( barrier_t* barrier ) +void bli_free_barrier_tree( barrier_t* barrier ) { if( barrier == NULL ) return; diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index 0ca6fdf4f..aa8bd8152 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -34,9 +34,6 @@ #ifndef BLIS_THREADING_H #define BLIS_THREADING_H -#define BLIS_TREE_BARRIER -#define BLIS_TREE_BARRIER_ARITY 4 - #ifdef BLIS_TREE_BARRIER struct barrier_s { From 7b9b228c6fa4cfb70b1ebb855b009a036e85fac3 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 16:29:10 -0500 Subject: [PATCH 30/42] Fix for tree barrier freeing bug --- frame/base/bli_threading.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 4f77d75f5..b2c9ae29c 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -40,9 +40,11 @@ void bli_free_barrier_tree( barrier_t* barrier ) if( barrier == NULL ) return; barrier->count--; - bli_free_barrier_tree( barrier->dad ); - if( barrier->count == 1 ) + if( barrier->count == 0 ) + { + bli_free_barrier_tree( barrier->dad ); bli_free( barrier ); + } return; } barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index) From e7ca9e4b4a24d585c9aec8293fc7bb79e4171ad0 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 16:31:15 -0500 Subject: [PATCH 31/42] Used BLIS_DEFAULT_*_MR for rounding partitioning instead of BLIS_DEFAULT_*_MC --- frame/3/gemm/bli_gemm_blk_var1f.c | 2 +- 
frame/3/gemm/bli_gemm_blk_var2f.c | 2 +- frame/3/herk/bli_herk_blk_var1f.c | 2 +- frame/3/herk/bli_herk_blk_var2f.c | 2 +- frame/3/trmm/bli_trmm_blk_var1f.c | 2 +- frame/3/trmm/bli_trmm_blk_var2b.c | 2 +- frame/3/trmm/bli_trmm_blk_var2f.c | 2 +- frame/3/trsm/bli_trsm_blk_var1b.c | 2 +- frame/3/trsm/bli_trsm_blk_var1f.c | 2 +- frame/3/trsm/bli_trsm_blk_var2b.c | 2 +- frame/3/trsm/bli_trsm_blk_var2f.c | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index a1b93eb1a..bba9c3290 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -83,7 +83,7 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, 0, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); + bli_get_range( thread, 0, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index 61ea352b9..71b190068 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -82,7 +82,7 @@ void bli_gemm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, BLIS_DEFAULT_NC_D, &start, &end ); + bli_get_range( thread, 0, n_trans, BLIS_DEFAULT_NR_D, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index fa184e2b0..5d5825087 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,7 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. 
m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range_weighted( thread, 0, m_trans, BLIS_DEFAULT_MC_D, bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, m_trans, BLIS_DEFAULT_MR_D, bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 8496b0852..c860de1d6 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -91,7 +91,7 @@ void bli_herk_blk_var2f( obj_t* a, // Needs to be replaced with a weighted range because triangle //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index fff6cc7fc..472cfc965 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -94,7 +94,7 @@ void bli_trmm_blk_var1f( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) { diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 25f07d031..6281c6e2d 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2b( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 0077ea9cb..aabe9f251 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -83,7 +83,7 @@ void bli_trmm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index fd34d3ec4..f73999b1f 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -82,7 +82,7 @@ void bli_trsm_blk_var1b( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); // Partition along the remaining portion of the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index 53868058d..e341341c9 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -81,7 +81,7 @@ void bli_trsm_blk_var1f( obj_t* a, offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MC_D, &start, &end ); + bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index c4ad6e7bd..9b2dc3b41 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -83,7 +83,7 @@ void bli_trsm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 0, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 0, &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 54e165029..85af8212d 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -84,7 +84,7 @@ void bli_trsm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NC_D, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 1, &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) From c332be8cd471eeace7b4fa4ae7443088b6a68ec3 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 16:37:50 -0500 Subject: [PATCH 32/42] Added -openmp flag to Xeon Phi build for convenience --- config/mic/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index c894bc638..c43222f6e 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate # --- Determine the C compiler and related flags --- CC := icc CPPROCFLAGS := -CMISCFLAGS := -mmic -fasm-blocks -std=c99 +CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp CDBGFLAGS := CWARNFLAGS := -Wall COPTFLAGS := -O3 From bde697f75ec1e7f2decebee0c9bd620b4c134cd5 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 4 Apr 2014 16:43:44 -0500 Subject: [PATCH 33/42] Add -openmp to ldflags as well --- config/mic/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index c43222f6e..b09bea493 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -98,7 +98,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -mmic -lm +LDFLAGS := -mmic -lm -openmp From 20e24430a772bc0fbaf24dec2f8c544096fd3f4e Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 8 Apr 2014 17:50:44 +0000 Subject: [PATCH 34/42] Some fixes for the bgq kernels --- kernels/bgq/1/bli_axpyv_opt_var1.c | 1 - kernels/bgq/1/bli_dotv_opt_var1.c | 4 ++-- kernels/bgq/3/bli_gemm_8x8.c | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.c b/kernels/bgq/1/bli_axpyv_opt_var1.c index 7e8e21115..8f389091b 100644 --- a/kernels/bgq/1/bli_axpyv_opt_var1.c +++ b/kernels/bgq/1/bli_axpyv_opt_var1.c @@ -75,4 +75,3 @@ void bli_daxpyv_opt_var1( y[4*n_run + i] += *alpha * x[4*n_run + i]; } } -} diff --git 
a/kernels/bgq/1/bli_dotv_opt_var1.c b/kernels/bgq/1/bli_dotv_opt_var1.c index 26b7753e7..11bf4741d 100644 --- a/kernels/bgq/1/bli_dotv_opt_var1.c +++ b/kernels/bgq/1/bli_dotv_opt_var1.c @@ -85,11 +85,11 @@ void bli_ddotv_opt_var1( rhos += vec_extract( rhov, 2 ); rhos += vec_extract( rhov, 3 ); } - for ( dim_t i = n_left; i < n_left; i++ ) + for ( dim_t i = 0; i < n_left; i++ ) { rhos += x[4*n_run + i] * y[4*n_run + i]; } *rho = rhos; -}} +} diff --git a/kernels/bgq/3/bli_gemm_8x8.c b/kernels/bgq/3/bli_gemm_8x8.c index 2b8905553..e2fe3f8d2 100644 --- a/kernels/bgq/3/bli_gemm_8x8.c +++ b/kernels/bgq/3/bli_gemm_8x8.c @@ -226,7 +226,7 @@ void bli_zgemm_8x8( double * a = (double*) a_z; double * b = (double*) b_z; double * c = (double*) c_z; - + //Registers for storing C. //2 2x4 subblocks of C, c0, and c1 //Each sub-block has 4 columns, 0, 1, 2, 3 @@ -253,7 +253,6 @@ void bli_zgemm_8x8( vector4double b0, b1, b2, b3; vector4double a0, a1; - double _Complex tmp = 0.0; for( dim_t i = 0; i < k; i++ ) { @@ -334,13 +333,13 @@ void bli_zgemm_8x8( \ /* Scale by alpha */ \ REG1 = vec_xmadd( alphav, AB, zed ); \ - REG2 = vec_xxcpnmadd( AB, alphav, zed ); \ + REG2 = vec_xxcpnmadd( AB, alphav, zed ); \ AB = vec_sub(REG1, REG2 ); \ \ \ /* Scale by beta */ \ REG1 = vec_xmadd( betav, C, zed ); \ - REG2 = vec_xxcpnmadd( C, betav, zed ); \ + REG2 = vec_xxcpnmadd( C, betav, zed ); \ C = vec_sub(REG1, REG2 ); \ \ /* Add AB to C */ \ From 456df0372170bd7ca2c7e2d85365a69f1f04de88 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 30 Apr 2014 12:28:00 -0500 Subject: [PATCH 35/42] Replaced register blocksize hack with querying the register blocksize for determining parallelism granularity --- frame/3/gemm/bli_gemm_blk_var1f.c | 4 +++- frame/3/gemm/bli_gemm_blk_var2f.c | 4 +++- frame/3/herk/bli_herk_blk_var1f.c | 4 +++- frame/3/herk/bli_herk_blk_var2f.c | 5 +++-- frame/3/trmm/bli_trmm_blk_var1f.c | 5 ++++- frame/3/trmm/bli_trmm_blk_var2b.c | 5 +++-- frame/3/trmm/bli_trmm_blk_var2f.c | 5 
+++-- frame/3/trmm/bli_trmm_rl_ker_var2.c | 4 ++-- frame/3/trmm/bli_trmm_ru_ker_var2.c | 4 ++-- frame/3/trsm/bli_trsm_blk_var1b.c | 4 +++- frame/3/trsm/bli_trsm_blk_var1f.c | 4 +++- frame/3/trsm/bli_trsm_blk_var2b.c | 4 +++- frame/3/trsm/bli_trsm_blk_var2f.c | 5 +++-- 13 files changed, 38 insertions(+), 19 deletions(-) diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index bba9c3290..8288f3ac9 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -83,7 +83,9 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, 0, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); + bli_get_range( thread, 0, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index 71b190068..63c2f5824 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -82,7 +82,9 @@ void bli_gemm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, BLIS_DEFAULT_NR_D, &start, &end ); + bli_get_range( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index 5d5825087..fbee3a750 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -82,7 +82,9 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. 
m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range_weighted( thread, 0, m_trans, BLIS_DEFAULT_MR_D, bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index c860de1d6..f8fc666ba 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,8 +90,9 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 472cfc965..c6cd75421 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -94,7 +94,10 @@ void bli_trmm_blk_var1f( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); + bli_get_range_weighted( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); + // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) { diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 6281c6e2d..64b33f310 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -82,8 +82,9 @@ void bli_trmm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index aabe9f251..8adaf2b57 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -82,8 +82,9 @@ void bli_trmm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 296325ec8..133c0d8ed 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -190,7 +190,7 @@ void PASTEMAC(ch,varname)( \ dim_t off_b1121; \ dim_t i, j; \ inc_t rstep_a; \ - inc_t cstep_b; \ + /*inc_t cstep_b; */\ inc_t rstep_c, cstep_c; \ inc_t ss_b; \ auxinfo_t aux; \ @@ -271,7 +271,7 @@ void PASTEMAC(ch,varname)( \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ - cstep_b = ps_b; \ + /*cstep_b = ps_b; */\ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 7f13e47a8..cb5ef580f 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -190,7 +190,7 @@ void PASTEMAC(ch,varname)( \ dim_t off_b0111; \ dim_t i, j; \ inc_t rstep_a; \ - inc_t cstep_b; \ + /*inc_t cstep_b; */\ inc_t rstep_c, cstep_c; \ inc_t ss_b; \ auxinfo_t aux; \ @@ -272,7 +272,7 @@ void PASTEMAC(ch,varname)( \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ - cstep_b = ps_b; \ + /*cstep_b = ps_b; */\ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index f73999b1f..d2037c202 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -82,7 +82,9 @@ void bli_trsm_blk_var1b( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); + bli_get_range_weighted( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the remaining portion of the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index e341341c9..7072d0438 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -81,7 +81,9 @@ void bli_trsm_blk_var1f( obj_t* a, offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); dim_t start, end; - bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end ); + bli_get_range( thread, offA, m_trans, + bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 9b2dc3b41..2ee269cee 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -83,7 +83,9 @@ void bli_trsm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 0, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_upper( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 85af8212d..41ccc668f 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -83,8 +83,9 @@ void bli_trsm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. 
n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - //bli_get_range( thread, 0, n_trans, 8, &start, &end ); - bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 1, &start, &end ); + bli_get_range_weighted( thread, 0, n_trans, + bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ), + bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) From bd1dc98ce599d74513a553fe3b37a2ebca1c3812 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 12 May 2014 17:26:19 -0500 Subject: [PATCH 36/42] Disabled multithreading of the kc loop --- frame/3/gemm/bli_gemm_threading.c | 3 ++- frame/3/herk/bli_herk_threading.c | 3 ++- frame/3/trmm/bli_trmm_threading.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 7a81e8e04..93c146ef9 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -110,7 +110,8 @@ void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num ) gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) { dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); - dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); +// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 7c5a6e141..00141ac68 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -110,7 +110,8 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num ) herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) { dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); - dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); +// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" 
); + dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index ff9e6723c..7d6d89cff 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -111,7 +111,8 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num ) trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) { dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); - dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); +// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); + dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); From 45957cc7745e9bb1698408d72f53ef192e960820 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 13 May 2014 17:14:46 -0500 Subject: [PATCH 37/42] Allowed threading to be turned off No longer requires OpenMP to compile Define the following in bli_config.h in order to enable multithreading: BLIS_ENABLE_MULTITHREADING BLIS_ENABLE_OPENMP Also fixes a bug with bli_get_range_weighted --- frame/3/gemm/bli_gemm_threading.c | 12 +++++- frame/3/herk/bli_herk_threading.c | 9 +++++ frame/3/trmm/bli_trmm_threading.c | 8 ++++ frame/base/bli_mem.c | 11 ++++-- frame/base/bli_threading.c | 64 ++++++++++++++++++++++++++++++- 5 files changed, 98 insertions(+), 6 deletions(-) diff --git a/frame/3/gemm/bli_gemm_threading.c b/frame/3/gemm/bli_gemm_threading.c index 93c146ef9..3056f019f 100644 --- a/frame/3/gemm/bli_gemm_threading.c +++ b/frame/3/gemm/bli_gemm_threading.c @@ -86,7 +86,7 @@ gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_ void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread) { - if( thread == NULL ) return; + if( thread == NULL || thread == &BLIS_GEMM_SINGLE_THREADED ) 
return; // Free Communicators if( thread_am_ochief( thread ) ) @@ -109,12 +109,22 @@ void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num ) gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( ) { + +#ifdef BLIS_ENABLE_MULTITHREADING dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); // dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); +#else + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; +#endif + dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 00141ac68..b0def6f3f 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -109,12 +109,21 @@ void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num ) herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) { + +#ifdef BLIS_ENABLE_MULTITHREADING dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); // dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); +#else + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; +#endif dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 7d6d89cff..7c675bbb7 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -110,12 +110,20 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num ) trmm_thrinfo_t** 
bli_create_trmm_thrinfo_paths( ) { +#ifdef BLIS_ENABLE_MULTITHREADING dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); // dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); +#else + dim_t jc_way = 1; + dim_t kc_way = 1; + dim_t ic_way = 1; + dim_t jr_way = 1; + dim_t ir_way = 1; +#endif dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; assert( global_num_threads != 0 ); diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 6df0361d4..06688cacf 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -127,7 +127,10 @@ void bli_mem_acquire_m( siz_t req_size, // BEGIN CRITICAL SECTION - _Pragma( "omp critical (mem)" ){ +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif + { // Query the index of the contiguous memory block that resides at the // "top" of the pool. @@ -199,8 +202,10 @@ void bli_mem_release( mem_t* mem ) // BEGIN CRITICAL SECTION - _Pragma( "omp critical (mem)" ){ - +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif + { // Increment the top of the memory pool. 
bli_pool_inc_top_index( pool ); diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index b2c9ae29c..5f16b34fe 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -105,6 +105,7 @@ void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) void tree_barrier( barrier_t* barack ) { +#ifdef BLIS_ENABLE_OPENMP int my_signal = barack->signal; int my_count; @@ -122,6 +123,9 @@ void tree_barrier( barrier_t* barack ) volatile int* listener = &barack->signal; while( *listener == my_signal ) {} } +#else + return +#endif } void bli_barrier( thread_comm_t* comm, dim_t t_id ) @@ -130,6 +134,7 @@ void bli_barrier( thread_comm_t* comm, dim_t t_id ) } #else + void bli_cleanup_communicator( thread_comm_t* communicator ) { if( communicator == NULL ) return; @@ -145,6 +150,7 @@ void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads) //barrier routine taken from art of multicore programming or something void bli_barrier( thread_comm_t* communicator, dim_t t_id ) { +#ifdef BLIS_ENABLE_OPENMP if(communicator == NULL || communicator->n_threads == 1) return; bool_t my_sense = communicator->barrier_sense; @@ -161,6 +167,9 @@ void bli_barrier( thread_comm_t* communicator, dim_t t_id ) volatile bool_t* listener = &communicator->barrier_sense; while( *listener == my_sense ) {} } +#else + return; +#endif } #endif @@ -226,6 +235,45 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto *end = bli_min( *start + n_pt, size + all_start ); } +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* out_start, dim_t* out_end) +{ + //bli_get_range( thr, all_start, all_end, block_factor, out_start, out_end ); + //return; + + thrinfo_t* thread = (thrinfo_t*) thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + + dim_t size = all_end - all_start; + dim_t start = all_start; + dim_t end = all_end; + + if( !forward ) { + 
work_id = n_way - work_id - 1; + } + + dim_t curr_caucus = n_way - 1; + dim_t len = 0; + dim_t num = size*size / n_way; // 2xArea per thread? + while(1){ + dim_t width = sqrt( len*len + num ) - len; // The width of the current caucus + width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); + if( curr_caucus == work_id ) { + if( end > width ) + start = bli_max(end - width, start); + break; + } + else{ + end -= width; + len += width; + curr_caucus--; + } + } + + *out_start = start; + *out_end = end; +} +/* void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) { thrinfo_t* thread = (thrinfo_t*) thr; @@ -257,11 +305,12 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl } else{ dim_t len = *end - *start; - dim_t num = len * len / n_way; + dim_t num = size*size / n_way; while(1){ dim_t width = sqrt(*start * *start + num) - *start; width = (width % block_factor == 0) ? 
width : width + block_factor - (width % block_factor); - if(!work_id) { + + if( work_id == 0 ) { *end = bli_min( *start + width, *end ); return; } @@ -272,6 +321,7 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl } } } +*/ void bli_level3_thread_decorator( dim_t n_threads, level3_int_t func, @@ -283,6 +333,7 @@ void bli_level3_thread_decorator( dim_t n_threads, void* cntl, void** thread ) { +#ifdef BLIS_ENABLE_OPENMP _Pragma( "omp parallel num_threads(n_threads)" ) { dim_t omp_id = omp_get_thread_num(); @@ -295,6 +346,15 @@ void bli_level3_thread_decorator( dim_t n_threads, cntl, thread[omp_id] ); } +#else + func( alpha, + a, + b, + beta, + c, + cntl, + thread[0] ); +#endif } dim_t bli_read_nway_from_env( char* env ) From 13a4c717ed0e273359dbaf5554cc4fa70b087d71 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 14 May 2014 14:59:04 -0500 Subject: [PATCH 38/42] Fixed bug with bli_get_range_weighted --- frame/base/bli_threading.c | 51 +++++++------------------------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index 5f16b34fe..eb0b0251a 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -235,45 +235,6 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto *end = bli_min( *start + n_pt, size + all_start ); } -void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* out_start, dim_t* out_end) -{ - //bli_get_range( thr, all_start, all_end, block_factor, out_start, out_end ); - //return; - - thrinfo_t* thread = (thrinfo_t*) thr; - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; - - dim_t size = all_end - all_start; - dim_t start = all_start; - dim_t end = all_end; - - if( !forward ) { - work_id = n_way - work_id - 1; - } - - dim_t curr_caucus = n_way - 1; - dim_t len = 0; - dim_t num = size*size / n_way; // 2xArea per thread? 
- while(1){ - dim_t width = sqrt( len*len + num ) - len; // The width of the current caucus - width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); - if( curr_caucus == work_id ) { - if( end > width ) - start = bli_max(end - width, start); - break; - } - else{ - end -= width; - len += width; - curr_caucus--; - } - } - - *out_start = start; - *out_end = end; -} -/* void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) { thrinfo_t* thread = (thrinfo_t*) thr; @@ -281,8 +242,8 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl dim_t work_id = thread->work_id; dim_t size = all_end - all_start; - *start = all_start; - *end = all_end; + *start = 0; + *end = all_end - all_start; if( forward ) { dim_t curr_caucus = n_way - 1; @@ -294,6 +255,9 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl if( curr_caucus == work_id ) { if( *end > width ) *start = *end - width; + + *start = *start + all_start; + *end = *end + all_start; return; } else{ @@ -304,6 +268,7 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl } } else{ + dim_t len = *end - *start; dim_t num = size*size / n_way; while(1){ @@ -312,6 +277,9 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl if( work_id == 0 ) { *end = bli_min( *start + width, *end ); + + *start = *start + all_start; + *end = *end + all_start; return; } else{ @@ -321,7 +289,6 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl } } } -*/ void bli_level3_thread_decorator( dim_t n_threads, level3_int_t func, From 5c048a90d8dfa1dbde4e45fbc10ffcbdfe59d960 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 14 May 2014 16:20:06 -0500 Subject: [PATCH 39/42] Disabled parallelism for right-sided TRMM JC loop The loop has dependent iterations. 
--- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_threading.c | 14 +++++++++++--- frame/3/trmm/bli_trmm_threading.h | 2 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index d8caba7dc..dea8b7771 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -125,7 +125,7 @@ void bli_trmm_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( !bli_is_left( side ) ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 7c675bbb7..fd9fe8e44 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -108,15 +108,23 @@ void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num ) bli_free( threads ); } -trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ) +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ) { #ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); // dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" ); dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); - dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + + if( !jc_dependency ){ + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); + } + else + { + dim_t jc_way = 1; + dim_t jr_way = bli_read_nway_from_env( "BLIS_JC_NT" ) * bli_read_nway_from_env( "BLIS_JR_NT" ); + } #else dim_t jc_way = 1; dim_t kc_way = 1; diff --git a/frame/3/trmm/bli_trmm_threading.h b/frame/3/trmm/bli_trmm_threading.h index 3b4ebd743..dadc65912 100644 --- a/frame/3/trmm/bli_trmm_threading.h +++ 
b/frame/3/trmm/bli_trmm_threading.h @@ -58,7 +58,7 @@ typedef struct trmm_thrinfo_s trmm_thrinfo_t; #define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) #define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( ); +trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ); void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads ); void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread, diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 6f8757faa..16c41154d 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,7 +127,7 @@ void bli_trmm3_front( side_t side, if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; - trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths(); + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. 
From 0b4b1680334528b1b60bc696537600f763198e92 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 16 May 2014 12:23:37 -0500 Subject: [PATCH 40/42] Fixed bug with disabling JC loop threading for right sided trmm --- frame/3/trmm/bli_trmm_threading.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index fd9fe8e44..03dca3a10 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -115,15 +115,12 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ) dim_t kc_way = 1; dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" ); dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" ); + dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); + dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); - if( !jc_dependency ){ - dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" ); - dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" ); - } - else - { - dim_t jc_way = 1; - dim_t jr_way = bli_read_nway_from_env( "BLIS_JC_NT" ) * bli_read_nway_from_env( "BLIS_JR_NT" ); + if( jc_dependency ){ + jr_way *= jc_way; + jc_way = 1; } #else dim_t jc_way = 1; From 8a0ef0e0db5880730425926f8ba56b457a2ba764 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Fri, 16 May 2014 13:44:14 -0500 Subject: [PATCH 41/42] Fixed rounding error in bli_get_range_weighted --- frame/base/bli_threading.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index eb0b0251a..ca6503b8e 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -250,13 +250,10 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl dim_t len = 0; dim_t num = size*size / n_way; // 2xArea per thread? 
while(1){ - dim_t width = sqrt( len*len + num ) - len; // The width of the current caucus + dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); if( curr_caucus == work_id ) { - if( *end > width ) - *start = *end - width; - - *start = *start + all_start; + *start = bli_max( 0 , *end - width ) + all_start; *end = *end + all_start; return; } @@ -268,18 +265,14 @@ void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t bl } } else{ - - dim_t len = *end - *start; dim_t num = size*size / n_way; while(1){ - dim_t width = sqrt(*start * *start + num) - *start; + dim_t width = ceil(sqrt(*start * *start + num)) - *start; width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); if( work_id == 0 ) { - *end = bli_min( *start + width, *end ); - *start = *start + all_start; - *end = *end + all_start; + *end = bli_min( *start + width, all_end ); return; } else{ From 21fb089387ee7c87f6dc53b0f60f68b48d3ff3e8 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Mon, 19 May 2014 20:38:55 -0700 Subject: [PATCH 42/42] Reverting changes dunnington and reference configs Now they are unchanged from the main branch of BLIS --- config/dunnington/bli_config.h | 4 ++-- config/dunnington/make_defs.mk | 4 ++-- config/reference/bli_config.h | 2 +- config/reference/make_defs.mk | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/dunnington/bli_config.h b/config/dunnington/bli_config.h index 22fc0a412..b397f3c94 100644 --- a/config/dunnington/bli_config.h +++ b/config/dunnington/bli_config.h @@ -69,7 +69,7 @@ // -- MULTITHREADING ----------------------------------------------------------- // The maximum number of BLIS threads that will run concurrently. 
-#define BLIS_MAX_NUM_THREADS 24 +#define BLIS_MAX_NUM_THREADS 1 @@ -80,7 +80,7 @@ // The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the // contiguous memory pools. #define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 4 +#define BLIS_NUM_KC_X_NC_BLOCKS 1 #define BLIS_NUM_MC_X_NC_BLOCKS 0 // The maximum preload byte offset is used to pad the end of the contiguous diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index b8af82d5d..a1f741d8e 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -80,7 +80,7 @@ CC := gcc # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -fopenmp #-pg +CMISCFLAGS := -std=c99 # -fopenmp -pg CDBGFLAGS := #-g CWARNFLAGS := -Wall COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer @@ -100,7 +100,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -lm -fopenmp +LDFLAGS := -lm diff --git a/config/reference/bli_config.h b/config/reference/bli_config.h index 2078a080d..f6be2e573 100644 --- a/config/reference/bli_config.h +++ b/config/reference/bli_config.h @@ -69,7 +69,7 @@ // -- MULTITHREADING ----------------------------------------------------------- // The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 2 +#define BLIS_MAX_NUM_THREADS 1 diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index a1e884808..ab2b5a462 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -76,14 +76,14 @@ GIT_LOG := $(GIT) log --decorate # # --- Determine the C compiler and related flags --- -CC := gcc-4.8 +CC := gcc # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 -fopenmp # -pg +CMISCFLAGS := -std=c99 # -fopenmp -pg CDBGFLAGS := -g CWARNFLAGS := -Wall -COPTFLAGS := -O0 -g +COPTFLAGS := -O2 CKOPTFLAGS := $(COPTFLAGS) CVECFLAGS := #-msse3 -march=native # -mfpmath=sse @@ -100,7 +100,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -fopenmp -lm +LDFLAGS := -lm